1 #'This function calculates predicted probabilities for 2 #'"observed" cases after a Bayesian logit or probit model 3 #'following Hanmer and Kalkan (2013, American Journal of 4 #'Political Science 57(1): 263-277) 5 #'@title Predicted Probabilities using Bayesian MCMC estimates for the Average of Observed Cases 6 #'@description Implements R function to calculate the predicted probabilities 7 #'for "observed" cases after a Bayesian logit or probit model, following 8 #'Hanmer & Kalkan (2013) (2013, American Journal of Political Science 57(1): 263-277). 9 #'@param modelmatrix model matrix, including intercept (if the intercept is among the 10 #'parameters estimated in the model). Create with model.matrix(formula, data). 11 #'Note: the order of columns in the model matrix must correspond to the order of columns 12 #'in the matrix of posterior draws in the \code{mcmcout} argument. See the \code{mcmcout} 13 #'argument for more. 14 #'@param mcmcout posterior distributions of all logit coefficients, 15 #'in matrix form. This can be created from rstan, MCMCpack, R2jags, etc. and transformed 16 #'into a matrix using the function as.mcmc() from the coda package for \code{jags} class 17 #'objects, as.matrix() from base R for \code{mcmc}, \code{mcmc.list}, \code{stanreg}, and 18 #'\code{stanfit} class objects, and \code{object$sims.matrix} for \code{bugs} class objects. 19 #'Note: the order of columns in this matrix must correspond to the order of columns 20 #'in the model matrix. One can do this by examining the posterior distribution matrix and sorting the 21 #'variables in the order of this matrix when creating the model matrix. A useful function for sorting 22 #'column names containing both characters and numbers as 23 #'you create the matrix of posterior distributions is \code{mixedsort()} from the gtools package. 24 #'@param xcol column number of the posterior draws (\code{mcmcout}) and model matrices 25 #'that corresponds to the explanatory variable for which to calculate associated Pr(y = 1). 26 #'Note that the columns in these matrices must match. 27 #'@param xrange name of the vector with the range of relevant values of the 28 #'explanatory variable for which to calculate associated Pr(y = 1). 29 #'@param xinterest semi-optional argument. Name of the explanatory variable for which 30 #'to calculate associated Pr(y = 1). If \code{xcol} is supplied, this is not needed. 31 #'If both are supplied, the function defaults to \code{xcol} and this argument is ignored. 32 #'@param link type of generalized linear model; a character vector set to \code{"logit"} (default) 33 #'or \code{"probit"}. 34 #'@param ci the bounds of the credible interval. Default is \code{c(0.025, 0.975)} for the 95\% 35 #'credible interval. 36 #'@param fullsims logical indicator of whether full object (based on all MCMC draws 37 #'rather than their average) will be returned. Default is \code{FALSE}. Note: The longer 38 #'\code{xrange} is, the larger the full output will be if \code{TRUE} is selected. 39 #'@references Hanmer, Michael J., & Ozan Kalkan, K. (2013). Behind the curve: Clarifying 40 #'the best approach to calculating predicted probabilities and marginal effects from 41 #'limited dependent variable models. American Journal of Political Science, 57(1), 42 #'263-277. https://doi.org/10.1111/j.1540-5907.2012.00602.x 43 #'@return if \code{fullsims = FALSE} (default), a tibble with 4 columns: 44 #'\itemize{ 45 #'\item x: value of variable of interest, drawn from \code{xrange} 46 #'\item median_pp: median predicted Pr(y = 1) when variable of interest is set to x 47 #'\item lower_pp: lower bound of credible interval of predicted probability at given x 48 #'\item upper_pp: upper bound of credible interval of predicted probability at given x 49 #'} 50 #'if \code{fullsims = TRUE}, a tibble with 3 columns: 51 #'\itemize{ 52 #'\item Iteration: number of the posterior draw 53 #'\item x: value of variable of interest, drawn from \code{xrange} 54 #'\item pp: average predicted Pr(y = 1) of all observed cases when variable of interest is set to x 55 #'} 56 #'@examples 57 #' \dontshow{.old_wd <- setwd(tempdir())} 58 #' \donttest{ 59 #' ## simulating data 60 #' set.seed(123456) 61 #' b0 <- 0.2 # true value for the intercept 62 #' b1 <- 0.5 # true value for first beta 63 #' b2 <- 0.7 # true value for second beta 64 #' n <- 500 # sample size 65 #' X1 <- runif(n, -1, 1) 66 #' X2 <- runif(n, -1, 1) 67 #' Z <- b0 + b1 * X1 + b2 * X2 68 #' pr <- 1 / (1 + exp(-Z)) # inv logit function 69 #' Y <- rbinom(n, 1, pr) 70 #' df <- data.frame(cbind(X1, X2, Y)) 71 #' 72 #' ## formatting the data for jags 73 #' datjags <- as.list(df) 74 #' datjags$N <- length(datjags$Y) 75 #' 76 #' ## creating jags model 77 #' model <- function() { 78 #' 79 #' for(i in 1:N){ 80 #' Y[i] ~ dbern(p[i]) ## Bernoulli distribution of y_i 81 #' logit(p[i]) <- mu[i] ## Logit link function 82 #' mu[i] <- b[1] + 83 #' b[2] * X1[i] + 84 #' b[3] * X2[i] 85 #' } 86 #' 87 #' for(j in 1:3){ 88 #' b[j] ~ dnorm(0, 0.001) ## Use a coefficient vector for simplicity 89 #' } 90 #' 91 #'} 92 #' 93 #' params <- c("b") 94 #' inits1 <- list("b" = rep(0, 3)) 95 #' inits2 <- list("b" = rep(0, 3)) 96 #' inits <- list(inits1, inits2) 97 #' 98 #' ## fitting the model with R2jags 99 #' library(R2jags) 100 #' set.seed(123) 101 #' fit <- jags(data = datjags, inits = inits, 102 #' parameters.to.save = params, n.chains = 2, n.iter = 2000, 103 #' n.burnin = 1000, model.file = model) 104 #' 105 #' ### observed value approach 106 #' library(coda) 107 #' xmat <- model.matrix(Y ~ X1 + X2, data = df) 108 #' mcmc <- as.mcmc(fit) 109 #' mcmc_mat <- as.matrix(mcmc)[, 1:ncol(xmat)] 110 #' X1_sim <- seq(from = min(datjags$X1), 111 #' to = max(datjags$X1), 112 #' length.out = 10) 113 #' obs_prob <- mcmcObsProb(modelmatrix = xmat, 114 #' mcmcout = mcmc_mat, 115 #' xrange = X1_sim, 116 #' xcol = 2) 117 #' } 118 #' 119 #' \dontshow{setwd(.old_wd)} 120 #'@export 121 #' 122 mcmcObsProb <- function(modelmatrix, 123 mcmcout, 124 xcol, 125 xrange, 126 xinterest, 127 link = "logit", 128 ci = c(0.025, 0.975), 129 fullsims = FALSE){ 130 131 # checking arguments 132 1 if(missing(xcol) & missing(xinterest)) { 133 0 stop("Please enter a column number or name of your variable of interest)") 134 } 135 1 if(!missing(xcol) & !missing(xinterest)) { 136 0 message("Both xcol and xinterest were supplied by user. Function defaults to xcol") 137 } 138 1 if(!missing(xinterest)) { 139 0 if(!(xinterest %in% variable.names(modelmatrix))) 140 0 stop("Variable name does not match any in the matrix. Please enter another.") 141 } 142 143 1 X <- matrix(rep(t(modelmatrix), length(xrange)), 144 1 ncol = ncol(modelmatrix), byrow = TRUE ) 145 1 colnames(X) <- variable.names(modelmatrix) 146 1 if(!missing(xcol)) { 147 1 X[, xcol] <- sort(rep(xrange, times = nrow(X) / length(xrange))) 148 } else { 149 0 X[ , grepl( xinterest , variable.names( X ) ) ] <- 150 0 sort(rep(xrange, times = nrow(X) / length(xrange))) 151 } 152 153 1 if(link == "logit"){ 154 1 pp <- plogis(t(X %*% t(mcmcout))) 155 } 156 157 1 if(link == "probit"){ 158 1 pp <- pnorm(t(X %*% t(mcmcout))) 159 } 160 161 162 # emptry matrix for PPs 163 1 pp_mat <- matrix(NA, nrow = nrow(mcmcout), ncol = length(xrange)) 164 165 # indices 166 1 pp_mat_lowerindex <- 1 + (0:(length(xrange) - 1) * nrow(modelmatrix)) 167 1 pp_mat_upperindex <- nrow(modelmatrix) + (0:(length(xrange) - 1) * 168 1 nrow(modelmatrix)) 169 170 171 # fill matrix with PPs, one for each value of the predictor of interest 172 1 for(i in 1:length(xrange)){ 173 1 pp_mat[, i] <- apply(X = pp[, 174 1 c(pp_mat_lowerindex[i]:pp_mat_upperindex[i])], 175 1 MARGIN = 1, FUN = function(x) mean(x)) 176 } 177 178 1 median_pp <- apply(X = pp_mat, MARGIN = 2, function(x) quantile(x, probs = c(0.5))) 179 1 lower_pp <- apply(X = pp_mat, MARGIN = 2, function(x) quantile(x, probs = ci[1])) 180 1 upper_pp <- apply(X = pp_mat, MARGIN = 2, function(x) quantile(x, probs = ci[2])) 181 182 1 pp_dat <- dplyr::tibble(x = xrange, 183 1 median_pp = median_pp, 184 1 lower_pp = lower_pp, 185 1 upper_pp = upper_pp) 186 187 1 if(fullsims == FALSE){ 188 1 return(pp_dat) # pp_dat was created by summarizing longFrame 189 } 190 191 1 if(fullsims == TRUE){ 192 1 longFrame <- reshape2::melt(pp_mat, id.vars = .data$Var2) 193 1 names(longFrame) <- c("Iteration", "x", "pp") 194 1 return(longFrame) 195 } 196 197 }

Read our documentation on viewing source code .