1
# Bootstrap confidence interval code
2

3
# ------------------------------------------------------------------------------
4
# helpers
5

6

7
check_rset <- function(x, app = TRUE) {
8 1
  if (!inherits(x, "bootstraps"))
9 1
    stop("`.data` should be an `rset` object generated from `bootstraps()`",
10 1
         call. = FALSE)
11

12 1
  if (app) {
13 1
    if(x %>% dplyr::filter(id == "Apparent") %>% nrow() != 1)
14 0
      stop("Please set `apparent = TRUE` in `bootstraps()` function",
15 0
           call. = FALSE)
16
  }
17 1
  invisible(NULL)
18
}
19

20

21
stat_fmt_err <- paste("`statistics` should select a list column of tidy results.")
22
stat_nm_err <- paste("The tibble in `statistics` should have columns for",
23
                     "'estimate' and 'term`")
24
std_exp <- c("std.error", "robust.se")
25

26
check_tidy_names <- function(x, std_col) {
27
  # check for proper columns
28 1
  if (sum(colnames(x) == "estimate") != 1) {
29 1
    stop(stat_nm_err, call. = FALSE)
30
  }
31 1
  if (sum(colnames(x) == "term") != 1) {
32 1
    stop(stat_nm_err, call. = FALSE)
33
  }
34 1
  if (std_col) {
35 1
    std_candidates <- colnames(x) %in% std_exp
36 1
    if (sum(std_candidates) != 1) {
37 1
      stop("`statistics` should select a single column for the standard ",
38 1
           "error.", call. = FALSE)
39
    }
40
  }
41 1
  invisible(TRUE)
42
}
43

44
check_tidy <- function(x, std_col = FALSE) {
45 1
  if (!is.list(x)) {
46 1
    stop(stat_fmt_err, call. = FALSE)
47
  }
48

49
  # convert to data frame from list
50 1
  has_id <- any(names(x) == "id")
51 1
  if (has_id) {
52 1
    list_cols <- names(x)[map_lgl(x, is_list)]
53 1
    x <- try(tidyr::unnest(x, cols = list_cols), silent = TRUE)
54
  } else {
55 1
    x <- try(map_dfr(x, ~ .x), silent = TRUE)
56
  }
57

58 1
  if (inherits(x, "try-error")) {
59 0
    stop(stat_fmt_err, call. = FALSE)
60
  }
61

62 1
  check_tidy_names(x, std_col)
63

64 1
  if (std_col) {
65 1
    std_candidates <- colnames(x) %in% std_exp
66 1
    std_candidates <- colnames(x)[std_candidates]
67 1
    if (has_id) {
68 1
      x <-
69 1
        dplyr::select(x, term, estimate, id, tidyselect::one_of(std_candidates)) %>%
70 1
        mutate(id = (id == "Apparent")) %>%
71 1
        setNames(c("term", "estimate", "orig", "std_err"))
72
    } else {
73 0
      x <-
74 0
        dplyr::select(x, term, estimate, tidyselect::one_of(std_candidates)) %>%
75 0
        setNames(c("term", "estimate", "std_err"))
76
    }
77

78
  } else {
79 1
    if (has_id) {
80 1
      x <-
81 1
        dplyr::select(x, term, estimate, id) %>%
82 1
        mutate(orig = (id == "Apparent")) %>%
83 1
        dplyr::select(-id)
84
    } else {
85 1
      x <- dplyr::select(x, term, estimate)
86
    }
87
  }
88

89 1
  x
90
}
91

92

93
get_p0 <- function(x, alpha = 0.05) {
94 1
  orig <- x %>%
95 1
    group_by(term) %>%
96 1
    dplyr::filter(orig) %>%
97 1
    dplyr::select(term, theta_0 = estimate) %>%
98 1
    ungroup()
99 1
  x %>%
100 1
    dplyr::filter(!orig) %>%
101 1
    inner_join(orig, by = "term") %>%
102 1
    group_by(term) %>%
103 1
    summarize(p0 = mean(estimate <= theta_0, na.rm = TRUE)) %>%
104 1
    mutate(Z0 = stats::qnorm(p0),
105 1
           Za = stats::qnorm(1 - alpha / 2))
106
}
107

108
new_stats <- function(x, lo, hi) {
109 1
  res <- as.numeric(quantile(x, probs = c(lo, hi), na.rm = TRUE))
110 1
  tibble(.lower = min(res), .estimate = mean(x, na.rm = TRUE), .upper = max(res))
111
}
112

113
has_dots <- function(x) {
114 1
  nms <- names(formals(x))
115 1
  if (!any(nms == "...")) {
116 1
    stop("`.fn` must have an argument `...`.", call. = FALSE)
117
  }
118 1
  invisible(NULL)
119
}
120

121
check_num_resamples <- function(x, B = 1000) {
122 1
  x <-
123 1
    x %>%
124 1
    dplyr::group_by(term) %>%
125 1
    dplyr::summarize(n = sum(!is.na(estimate))) %>%
126 1
    dplyr::filter(n < B)
127

128 1
  if (nrow(x) > 0) {
129 1
    terms <- paste0("`", x$term, "`")
130 1
    msg <-
131 1
      paste0(
132 1
        "Recommend at least ", B, " non-missing bootstrap resamples for ",
133 1
        ifelse(length(terms) > 1, "terms: ", "term "),
134 1
        paste0(terms, collapse = ", "),
135
        "."
136
      )
137 1
    warning(msg, call. = FALSE)
138
  }
139 1
  invisible(NULL)
140
}
141

142
# ------------------------------------------------------------------------------
143
# percentile code
144

145

146
pctl_single <- function(stats, alpha = 0.05) {
147

148 1
  if (all(is.na(stats)))
149 1
    stop("All statistics have missing values..", call. = FALSE)
150

151 1
  if (!is.numeric(stats))
152 1
    stop("`stats` must be a numeric vector.", call. = FALSE)
153

154
  # stats is a numeric vector of values
155 1
  ci <- stats %>% quantile(probs = c(alpha / 2, 1 - alpha / 2), na.rm = TRUE)
156

157
  # return a tibble with .lower, .estimate, .upper
158 1
  res <- tibble(
159 1
    .lower = min(ci),
160 1
    .estimate = mean(stats, na.rm = TRUE),
161 1
    .upper = max(ci),
162 1
    .alpha = alpha,
163 1
    .method = "percentile"
164
  )
165 1
  res
166
}
167

168
#' Bootstrap confidence intervals
169
#' @description
170
#' Calculate bootstrap confidence intervals using various methods.
171
#' @param .data A data frame containing the bootstrap resamples created using
172
#'  `bootstraps()`. For t- and BCa-intervals, the `apparent` argument
173
#'  should be set to `TRUE`. Even if the `apparent` argument is set to
174
#'  `TRUE` for the percentile method, the apparent data is never used in calculating
175
#'  the percentile confidence interval.
176
#' @param statistics An unquoted column name or `dplyr` selector that identifies
177
#'  a single column in the data set that contains the individual bootstrap
178
#'  estimates. This can be a list column of tidy tibbles (that contains columns
179
#'  `term` and `estimate`) or a simple numeric column. For t-intervals, a
180
#'  standard tidy column (usually called `std.err`) is required.
181
#'  See the examples below.
182
#' @param alpha Level of significance
183
#' @return Each function returns a tibble with columns `.lower`,
184
#'  `.estimate`, `.upper`, `.alpha`, `.method`, and `term`.
185
#'  `.method` is the type of interval (eg. "percentile",
186
#'  "student-t", or "BCa"). `term` is the name of the estimate. Note
187
#'  the `.estimate` returned from `int_pctl()`
188
#'  is the mean of the estimates from the bootstrap resamples
189
#'  and not the estimate from the apparent model.
190
#' @details Percentile intervals are the standard method of
191
#'  obtaining confidence intervals but require thousands of
192
#'  resamples to be accurate. T-intervals may need fewer
193
#'  resamples but require a corresponding variance estimate.
194
#'  Bias-corrected and accelerated intervals require the original function
195
#'  that was used to create the statistics of interest and are
196
#'  computationally taxing.
197
#'
198
#' @references Davison, A., & Hinkley, D. (1997). _Bootstrap Methods and their
199
#'  Application_. Cambridge: Cambridge University Press.
200
#'  doi:10.1017/CBO9780511802843
201
#'
202
#' @examples
203
#' library(broom)
204
#' library(dplyr)
205
#' library(purrr)
206
#' library(tibble)
207
#'
208
#' lm_est <- function(split, ...) {
209
#'   lm(mpg ~ disp + hp, data = analysis(split)) %>%
210
#'     tidy()
211
#' }
212
#'
213
#' set.seed(52156)
214
#' car_rs <-
215
#'   bootstraps(mtcars, 500, apparent = TRUE) %>%
216
#'   mutate(results = map(splits, lm_est))
217
#'
218
#' int_pctl(car_rs, results)
219
#' int_t(car_rs, results)
220
#' int_bca(car_rs, results, .fn = lm_est)
221
#'
222
#' # putting results into a tidy format
223
#' rank_corr <- function(split) {
224
#'   dat <- analysis(split)
225
#'   tibble(
226
#'     term = "corr",
227
#'     estimate = cor(dat$sqft, dat$price, method = "spearman"),
228
#'     # don't know the analytical std.err so no t-intervals
229
#'     std.err = NA_real_
230
#'   )
231
#' }
232
#'
233
#' set.seed(69325)
234
#' data(Sacramento, package = "modeldata")
235
#' bootstraps(Sacramento, 1000, apparent = TRUE) %>%
236
#'   mutate(correlations = map(splits, rank_corr)) %>%
237
#'   int_pctl(correlations)
238
#' @export
239
int_pctl <- function(.data, statistics, alpha = 0.05) {
240

241 1
  check_rset(.data, app = FALSE)
242

243 1
  .data <- .data %>% dplyr::filter(id != "Apparent")
244

245 1
  column_name <- tidyselect::vars_select(names(.data), !!rlang::enquo(statistics))
246 1
  if (length(column_name) != 1) {
247 0
    stop(stat_fmt_err, call. = FALSE)
248
  }
249 1
  stats <- .data[[column_name]]
250 1
  stats <- check_tidy(stats, std_col = FALSE)
251

252 1
  check_num_resamples(stats, B = 1000)
253

254 1
  vals <- stats %>%
255 1
    dplyr::group_by(term) %>%
256 1
    dplyr::do(pctl_single(.$estimate, alpha = alpha)) %>%
257 1
    dplyr::ungroup()
258 1
  vals
259

260
}
261

262
# ------------------------------------------------------------------------------
263
# t interval code
264

265
t_single <- function(stats, std_err, is_orig, alpha = 0.05) {
266
  # stats is a numeric vector of values
267
  # vars is a numeric vector of variances
268
  # return a tibble with .lower, .estimate, .upper
269
  # which_orig is the index of stats and std_err that has the original result
270

271 1
  if (all(is.na(stats)))
272 1
    stop("All statistics have missing values.", call. = FALSE)
273

274 1
  if (!is.logical(is_orig) || any(is.na(is_orig))) {
275 0
    stop("`is_orig` should be a logical column the same length as `stats` ",
276 0
         "with no missing values.", call. = FALSE)
277
  }
278 1
  if (length(stats) != length(std_err) && length(stats) != length(is_orig)) {
279 0
    stop("`stats`, `std_err`, and `is_orig` should have the same length.",
280 0
         call. = FALSE)
281
  }
282 1
  if (sum(is_orig) != 1) {
283 0
    stop("The original statistic must be in a single row.", call. = FALSE)
284
  }
285

286 1
  theta_obs   <- stats[is_orig]
287 1
  std_err_obs <- std_err[is_orig]
288

289 1
  stats   <- stats[!is_orig]
290 1
  std_err <- std_err[!is_orig]
291

292 1
  z_dist <-
293 1
    (stats - theta_obs) / std_err
294

295 1
  z_pntl <-
296 1
    quantile(z_dist, probs = c(alpha / 2, 1 - (alpha) / 2), na.rm = TRUE)
297

298 1
  ci <- theta_obs - z_pntl * std_err_obs
299

300 1
  tibble(
301 1
    .lower = min(ci),
302 1
    .estimate = mean(stats, na.rm = TRUE),
303 1
    .upper = max(ci),
304 1
    .alpha = alpha,
305 1
    .method = "student-t"
306
  )
307
}
308

309

310
#' @rdname int_pctl
311
#' @export
312
int_t <- function(.data, statistics, alpha = 0.05) {
313

314 1
  check_rset(.data)
315

316 1
  column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics))
317 1
  if (length(column_name) != 1) {
318 0
    stop(stat_fmt_err, call. = FALSE)
319
  }
320 1
  stats <- .data %>% dplyr::select(!!column_name, id)
321 1
  stats <- check_tidy(stats, std_col = TRUE)
322

323 1
  check_num_resamples(stats, B = 500)
324

325 1
  vals <-
326 1
    stats %>%
327 1
    dplyr::group_by(term) %>%
328 1
    dplyr::do(t_single(.$estimate, .$std_err, .$orig, alpha = alpha)) %>%
329 1
    dplyr::ungroup()
330 1
  vals
331
}
332

333

334
# ----------------------------------------------------------------
335

336
bca_calc <- function(stats, orig_data, alpha = 0.05, .fn, ...) {
337

338
  # TODO check per term
339 1
  if (all(is.na(stats$estimate))) {
340 1
    stop("All statistics have missing values.", call. = FALSE)
341
  }
342

343
  ### Estimating Z0 bias-correction
344 1
  bias_corr_stats <- get_p0(stats, alpha = alpha)
345

346
  # need the original data frame here
347 1
  loo_rs <- loo_cv(orig_data)
348

349
  # We can't be sure what we will get back from the analysis function.
350
  # To test, we run on the first LOO data set and see if it is a vector or df
351 1
  loo_test <- try(rlang::exec(.fn, loo_rs$splits[[1]], ...), silent = TRUE)
352 1
  if (inherits(loo_test, "try-error")) {
353 0
    cat("Running `.fn` on the LOO resamples produced an error:\n")
354 0
    print(loo_test)
355 0
    stop("`.fn` failed.", call. = FALSE)
356
  }
357

358 1
  loo_res <- furrr::future_map_dfr(loo_rs$splits, .fn, ...)
359

360 1
  loo_estimate <-
361 1
    loo_res %>%
362 1
    dplyr::group_by(term) %>%
363 1
    dplyr::summarize(loo = mean(estimate, na.rm = TRUE)) %>%
364 1
    dplyr::inner_join(loo_res, by = "term")  %>%
365 1
    dplyr::group_by(term) %>%
366 1
    dplyr::summarize(
367 1
      cubed = sum((loo - estimate)^3),
368 1
      squared = sum((loo - estimate)^2)
369
    ) %>%
370 1
    dplyr::ungroup() %>%
371 1
    dplyr::inner_join(bias_corr_stats, by = "term") %>%
372 1
    dplyr::mutate(
373 1
      a = cubed/(6 * (squared^(3 / 2))),
374 1
      Zu = (Z0 + Za) / ( 1 - a * (Z0 + Za)) + Z0,
375 1
      Zl = (Z0 - Za) / (1 - a * (Z0 - Za)) + Z0,
376 1
      lo = stats::pnorm(Zl, lower.tail = TRUE),
377 1
      hi = stats::pnorm(Zu, lower.tail = TRUE)
378
    )
379

380 1
  terms <- loo_estimate$term
381 1
  stats <- stats %>% dplyr::filter(!orig)
382 1
  for (i in seq_along(terms)) {
383 1
    tmp <- new_stats(stats$estimate[ stats$term == terms[i] ],
384 1
                     lo = loo_estimate$lo[i],
385 1
                     hi = loo_estimate$hi[i])
386 1
    tmp$term <- terms[i]
387 1
    if (i == 1) {
388 1
      ci_bca <- tmp
389
    } else {
390 1
      ci_bca <- bind_rows(ci_bca, tmp)
391
    }
392
  }
393 1
  ci_bca <-
394 1
    ci_bca %>%
395 1
    dplyr::select(term, .lower, .estimate, .upper) %>%
396 1
    dplyr::mutate(
397 1
      .alpha = alpha,
398 1
      .method = "BCa"
399
    )
400
}
401

402

403
#' @rdname int_pctl
404
#' @param .fn A function to calculate statistic of interest. The
405
#'  function should take an `rsplit` as the first argument and the `...` are
406
#'  required.
407
#' @param ... Arguments to pass to `.fn`.
408
#' @references \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html}
409
#' @export
410
int_bca <- function(.data, statistics, alpha = 0.05, .fn, ...) {
411

412 1
  check_rset(.data)
413

414 1
  has_dots(.fn)
415

416 1
  column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics))
417 1
  if (length(column_name) != 1) {
418 0
    stop(stat_fmt_err, call. = FALSE)
419
  }
420 1
  stats <- .data %>% dplyr::select(!!column_name, id)
421 1
  stats <- check_tidy(stats)
422

423 1
  check_num_resamples(stats, B = 1000)
424

425 1
  vals <- bca_calc(stats, .data$splits[[1]]$data, alpha = alpha, .fn = .fn, ...)
426 1
  vals
427
}

Read our documentation on viewing source code .

Loading