ewenharrison / finalfit
1
#' Bootstrap simulation for model prediction
2
#'
3
#' Generate model predictions against a specified set of explanatory levels with
4
#' bootstrapped confidence intervals. Add a comparison by difference or ratio of
5
#' the first row of \code{newdata} with all subsequent rows.
6
#'
7
#' To use this, first generate \code{newdata} for specified levels of
8
#' explanatory variables using \code{\link{finalfit_newdata}}. Pass model
9
#' objects from \code{lm}, \code{glm}, \code{\link{lmmulti}}, and
10
#' \code{\link{glmmulti}}. The comparison metrics are made on individual
11
#' bootstrap samples distribution returned as a mean with confidence intervals.
12
#' A p-value is generated on the proportion of values on the other side of the
13
#' null from the mean, e.g. for a ratio greater than 1.0, p is the number of
14
#' bootstrapped predictions under 1.0, multiplied by two so is two-sided.
15
#'
16
#' @param fit A model generated using \code{lm}, \code{glm},
17
#'   \code{\link{lmmulti}}, and \code{\link{glmmulti}}.
18
#' @param newdata Dataframe usually generated with
19
#'   \code{\link{finalfit_newdata}}.
20
#' @param type the type of prediction required, see
21
#'   \code{\link[stats]{predict.glm}}. The default for glm models is on the
22
#'   scale of the response variable. Thus for a binomial model the default
23
#'   predictions are predicted probabilities.
24
#' @param R Number of simulations. Note default R=100 is very low.
25
#' @param estimate_name Name to be given to prediction variable y-hat.
26
#' @param confint_sep String separating lower and upper confidence interval
27
#' @param condense Logical. FALSE gives numeric values, usually for plotting.
28
#'   TRUE gives table for final output.
29
#' @param boot_compare Include a comparison with the first row of \code{newdata}
30
#'   with all subsequent rows. See \code{\link{boot_compare}}.
31
#' @param compare_name Name to be given to comparison metric.
32
#' @param comparison Either "difference" or "ratio".
33
#' @param ref_symbol Reference level symbol
34
#' @param digits Rounding for estimate values and p-values, default c(2,3).
35
#' @return A dataframe of predicted values and confidence intervals, with the
36
#'   option of including a comparison of difference between first row and all
37
#'   subsequent rows of \code{newdata}.
38
#'
39
#' @seealso \link{finalfit_newdata}
40
#'
41
#' /code{finalfit} predict functions
42
#' @export
43
#' @importFrom broom tidy
44
#' @importFrom boot boot
45
#' @importFrom stats predict
46
#'
47
#' @examples
48
#' library(finalfit)
49
#' library(dplyr)
50
#'
51
#' # Predict probability of death across combinations of factor levels
52
#' explanatory = c("age.factor", "extent.factor", "perfor.factor")
53
#' dependent = 'mort_5yr'
54
#'
55
#' # Generate combination of factor levels
56
#' colon_s %>%
57
#'   finalfit_newdata(explanatory = explanatory, newdata = list(
58
#'     c("<40 years",  "Submucosa", "No"),
59
#'     c("<40 years", "Submucosa", "Yes"),
60
#'     c("<40 years", "Adjacent structures", "No"),
61
#'     c("<40 years", "Adjacent structures", "Yes")
62
#'    )) -> newdata
63
#'
64
#' # Run simulation
65
#' colon_s %>%
66
#'   glmmulti(dependent, explanatory) %>%
67
#'   boot_predict(newdata, estimate_name = "Predicted probability of death",
68
#'     compare_name = "Absolute risk difference", R=100, digits = c(2,3))
69
#'
70
#' # Plotting
71
#' explanatory = c("nodes", "extent.factor", "perfor.factor")
72
#' colon_s %>%
73
#'   finalfit_newdata(explanatory = explanatory, rowwise = FALSE, newdata = list(
74
#'   rep(seq(0, 30), 4),
75
#'   c(rep("Muscle", 62), rep("Adjacent structures", 62)),
76
#'   c(rep("No", 31), rep("Yes", 31), rep("No", 31), rep("Yes", 31))
77
#' )) -> newdata
78
#'
79
#' colon_s %>%
80
#'   glmmulti(dependent, explanatory) %>%
81
#'   boot_predict(newdata, boot_compare = FALSE, R=100, condense=FALSE) -> plot
82
#'
83
#'   library(ggplot2)
84
#'   theme_set(theme_bw())
85
#'   plot %>%
86
#'     ggplot(aes(x = nodes, y = estimate, ymin = estimate_conf.low,
87
#'         ymax = estimate_conf.high, fill=extent.factor))+
88
#'       geom_line(aes(colour = extent.factor))+
89
#'       geom_ribbon(alpha=0.1)+
90
#'       facet_grid(.~perfor.factor)+
91
#'       xlab("Number of postive lymph nodes")+
92
#'       ylab("Probability of death")+
93
#'       labs(fill = "Extent of tumour", colour = "Extent of tumour")+
94
#'       ggtitle("Probability of death by lymph node count")
95

96
boot_predict = function (fit, newdata, type = "response", R = 100,
97
                         estimate_name = NULL,
98
                         confint_sep = " to ", condense=TRUE, boot_compare = TRUE,
99
                         compare_name = NULL, comparison = "difference", ref_symbol = "-",
100
                         digits = c(2, 3)){
101 1
  fit_class = attr(fit, "class")[1]
102

103
  # Ensure lmlist | glmlist objects are length == 1
104 1
  if(fit_class %in% c("lmlist", "glmlist") & length(fit) > 1){
105 0
    stop("Multiple models in fit, must be single model")}
106

107
  # Unlist lmlist & glmlist objects
108 0
  if(fit_class %in% c("lmlist", "glmlist")) fit = fit[[1]]
109 1
  fit_class = attr(fit, "class")[1]
110

111
  # Stop if not lm | glm
112 0
  if(!any(fit_class %in% c("lm", "glm"))) stop("fit must contain an lm or glm model")
113

114
  # Stop if newdata not dataframe
115 0
  if(!is.data.frame(newdata)) stop("Must provide dataframe with new data, see examples")
116

117 1
  if(is.null(estimate_name)) estimate_name = "estimate"
118

119 1
  formula = fit$terms
120 1
  family = fit$family$family
121 1
  link = fit$family$link
122 1
  if(is.null(family)) {
123 0
    family = substitute(gaussian)
124
  }else{
125 1
    family = match.fun(family)
126 1
    family = substitute(family(link=link))
127
  }
128 1
  .data = fit$model
129

130
  # Statistic function to bootstrap
131 1
  statistic = function(formula, family, .data, indices) {
132 1
    d = .data[indices, ]
133 1
    fit_boot = glm(formula=formula, family = eval(family), data = d)
134 1
    out = predict(fit_boot, newdata=newdata, type=type)
135 1
    return(out)
136
  }
137

138
  # Run bootstrap
139 1
  bs.out = boot::boot(data = .data, statistic = statistic, R = R,
140 1
                        formula = formula, family = family)
141

142 1
  bs.tidy = broom::tidy(bs.out, conf.int = TRUE, conf.level = 0.95, conf.method = "perc") #Future options
143 1
  bs.tidy = data.frame(bs.tidy)
144

145

146 1
  if(!condense){
147 1
    df.out = bs.tidy[, c(1, 4,5)]
148 1
    colnames(df.out) = c(estimate_name, paste0(estimate_name, "_conf.low"), paste0(estimate_name, "_conf.high"))
149
  } else {
150 1
    bs.tidy %>%
151 1
      dplyr::mutate_all(round_tidy, digits = digits[1]) -> df.out
152 1
    df.out = data.frame(
153 1
      paste0(df.out$statistic, " (", df.out$conf.low, confint_sep, df.out$conf.high, ")"))
154 1
    colnames(df.out) = estimate_name
155
  }
156

157

158 1
  if(boot_compare){
159 1
    bc.out = boot_compare(bs.out, confint_sep = confint_sep, comparison = comparison,
160 1
                          condense = condense,
161 1
                          compare_name = compare_name, digits=digits,
162 1
                          ref_symbol = ref_symbol)
163 1
    df.out = cbind(df.out, bc.out)
164
  }
165

166

167
  # Final table
168 1
  if(condense){
169 1
    labels = extract_variable_label(newdata)
170 1
    names(newdata) = labels
171
  }
172 1
  df.out = cbind(newdata, df.out)
173 1
  return(df.out)
174
}

Read our documentation on viewing source code .

Loading