1
# Bootstrap confidence interval code
2

3
# ------------------------------------------------------------------------------
4
# helpers
5

6

7
check_rset <- function(x, app = TRUE) {
8 1
  if (!inherits(x, "bootstraps"))
9 1
    stop("`.data` should be an `rset` object generated from `bootstraps()`",
10 1
         call. = FALSE)
11

12 1
  if (app) {
13 1
    if(x %>% dplyr::filter(id == "Apparent") %>% nrow() != 1)
14 0
      stop("Please set `apparent = TRUE` in `bootstraps()` function",
15 0
           call. = FALSE)
16
  }
17 1
  invisible(NULL)
18
}
19

20

21
stat_fmt_err <- paste("`statistics` should select a list column of tidy results.")
22
stat_nm_err <- paste("The tibble in `statistics` should have columns for",
23
                     "'estimate' and 'term`")
24
std_exp <- c("std.error", "robust.se")
25

26
check_tidy_names <- function(x, std_col) {
27
  # check for proper columns
28 1
  if (sum(colnames(x) == "estimate") != 1) {
29 1
    stop(stat_nm_err, call. = FALSE)
30
  }
31 1
  if (sum(colnames(x) == "term") != 1) {
32 1
    stop(stat_nm_err, call. = FALSE)
33
  }
34 1
  if (std_col) {
35 1
    std_candidates <- colnames(x) %in% std_exp
36 1
    if (sum(std_candidates) != 1) {
37 1
      stop("`statistics` should select a single column for the standard ",
38 1
           "error.", call. = FALSE)
39
    }
40
  }
41 1
  invisible(TRUE)
42
}
43

44
check_tidy <- function(x, std_col = FALSE) {
45 1
  if (!is.list(x)) {
46 1
    stop(stat_fmt_err, call. = FALSE)
47
  }
48

49
  # convert to data frame from list
50 1
  has_id <- any(names(x) == "id")
51 1
  if (has_id) {
52 1
    list_cols <- names(x)[map_lgl(x, is_list)]
53 1
    x <- try(tidyr::unnest(x, cols = list_cols), silent = TRUE)
54
  } else {
55 1
    x <- try(map_dfr(x, ~ .x), silent = TRUE)
56
  }
57

58 1
  if (inherits(x, "try-error")) {
59 0
    stop(stat_fmt_err, call. = FALSE)
60
  }
61

62 1
  check_tidy_names(x, std_col)
63

64 1
  if (std_col) {
65 1
    std_candidates <- colnames(x) %in% std_exp
66 1
    std_candidates <- colnames(x)[std_candidates]
67 1
    if (has_id) {
68 1
      x <-
69 1
        dplyr::select(x, term, estimate, id, tidyselect::one_of(std_candidates)) %>%
70 1
        mutate(id = (id == "Apparent")) %>%
71 1
        setNames(c("term", "estimate", "orig", "std_err"))
72
    } else {
73 0
      x <-
74 0
        dplyr::select(x, term, estimate, tidyselect::one_of(std_candidates)) %>%
75 0
        setNames(c("term", "estimate", "std_err"))
76
    }
77

78
  } else {
79 1
    if (has_id) {
80 1
      x <-
81 1
        dplyr::select(x, term, estimate, id) %>%
82 1
        mutate(orig = (id == "Apparent")) %>%
83 1
        dplyr::select(-id)
84
    } else {
85 1
      x <- dplyr::select(x, term, estimate)
86
    }
87
  }
88

89 1
  x
90
}
91

92

93
get_p0 <- function(x, alpha = 0.05) {
94 1
  orig <- x %>%
95 1
    group_by(term) %>%
96 1
    dplyr::filter(orig) %>%
97 1
    dplyr::select(term, theta_0 = estimate) %>%
98 1
    ungroup()
99 1
  x %>%
100 1
    dplyr::filter(!orig) %>%
101 1
    inner_join(orig, by = "term") %>%
102 1
    group_by(term) %>%
103 1
    summarize(p0 = mean(estimate <= theta_0, na.rm = TRUE)) %>%
104 1
    mutate(Z0 = stats::qnorm(p0),
105 1
           Za = stats::qnorm(1 - alpha / 2))
106
}
107

108
new_stats <- function(x, lo, hi) {
109 1
  res <- as.numeric(quantile(x, probs = c(lo, hi), na.rm = TRUE))
110 1
  tibble(.lower = min(res), .estimate = mean(x, na.rm = TRUE), .upper = max(res))
111
}
112

113
has_dots <- function(x) {
114 1
  nms <- names(formals(x))
115 1
  if (!any(nms == "...")) {
116 1
    stop("`.fn` must have an argument `...`.", call. = FALSE)
117
  }
118 1
  invisible(NULL)
119
}
120

121
check_num_resamples <- function(x, B = 1000) {
122 1
  x <-
123 1
    x %>%
124 1
    dplyr::group_by(term) %>%
125 1
    dplyr::summarize(n = sum(!is.na(estimate))) %>%
126 1
    dplyr::filter(n < B)
127

128 1
  if (nrow(x) > 0) {
129 1
    terms <- paste0("`", x$term, "`")
130 1
    msg <-
131 1
      paste0(
132 1
        "Recommend at least ", B, " non-missing bootstrap resamples for ",
133 1
        ifelse(length(terms) > 1, "terms: ", "term "),
134 1
        paste0(terms, collapse = ", "),
135
        "."
136
      )
137 1
    warning(msg, call. = FALSE)
138
  }
139 1
  invisible(NULL)
140
}
141

142
# ------------------------------------------------------------------------------
143
# percentile code
144

145

146
pctl_single <- function(stats, alpha = 0.05) {
147

148 1
  if (all(is.na(stats)))
149 1
    stop("All statistics have missing values..", call. = FALSE)
150

151 1
  if (!is.numeric(stats))
152 1
    stop("`stats` must be a numeric vector.", call. = FALSE)
153

154
  # stats is a numeric vector of values
155 1
  ci <- stats %>% quantile(probs = c(alpha / 2, 1 - alpha / 2), na.rm = TRUE)
156

157
  # return a tibble with .lower, .estimate, .upper
158 1
  res <- tibble(
159 1
    .lower = min(ci),
160 1
    .estimate = mean(stats, na.rm = TRUE),
161 1
    .upper = max(ci),
162 1
    .alpha = alpha,
163 1
    .method = "percentile"
164
  )
165 1
  res
166
}
167

168
#' Bootstrap confidence intervals
169
#' @description
170
#' Calculate bootstrap confidence intervals using various methods.
171
#' @param .data A data frame containing the bootstrap resamples created using
172
#'  `bootstraps()`. For t- and BCa-intervals, the `apparent` argument
173
#'  should be set to `TRUE`. Even if the `apparent` argument is set to
174
#'  `TRUE` for the percentile method, the apparent data is never used in calculating
175
#'  the percentile confidence interval.
176
#' @param statistics An unquoted column name or `dplyr` selector that identifies
177
#'  a single column in the data set that contains the individual bootstrap
178
#'  estimates. This can be a list column of tidy tibbles (that contains columns
179
#'  `term` and `estimate`) or a simple numeric column. For t-intervals, a
180
#'  standard tidy column (usually called `std.err`) is required.
181
#'  See the examples below.
182
#' @param alpha Level of significance
183
#' @return Each function returns a tibble with columns `.lower`,
184
#'  `.estimate`, `.upper`, `.alpha`, `.method`, and `term`.
185
#'  `.method` is the type of interval (eg. "percentile",
186
#'  "student-t", or "BCa"). `term` is the name of the estimate. Note
187
#'  the `.estimate` returned from `int_pctl()`
188
#'  is the mean of the estimates from the bootstrap resamples
189
#'  and not the estimate from the apparent model.
190
#' @details Percentile intervals are the standard method of
191
#'  obtaining confidence intervals but require thousands of
192
#'  resamples to be accurate. T-intervals may need fewer
193
#'  resamples but require a corresponding variance estimate.
194
#'  Bias-corrected and accelerated intervals require the original function
195
#'  that was used to create the statistics of interest and are
196
#'  computationally taxing.
197
#'
198
#' @references Davison, A., & Hinkley, D. (1997). _Bootstrap Methods and their
199
#'  Application_. Cambridge: Cambridge University Press.
200
#'  doi:10.1017/CBO9780511802843
201
#'
202
#' @examples
203
#' \donttest{
204
#' library(broom)
205
#' library(dplyr)
206
#' library(purrr)
207
#' library(tibble)
208
#'
209
#' lm_est <- function(split, ...) {
210
#'   lm(mpg ~ disp + hp, data = analysis(split)) %>%
211
#'     tidy()
212
#' }
213
#'
214
#' set.seed(52156)
215
#' car_rs <-
216
#'   bootstraps(mtcars, 500, apparent = TRUE) %>%
217
#'   mutate(results = map(splits, lm_est))
218
#'
219
#' int_pctl(car_rs, results)
220
#' int_t(car_rs, results)
221
#' int_bca(car_rs, results, .fn = lm_est)
222
#'
223
#' # putting results into a tidy format
224
#' rank_corr <- function(split) {
225
#'   dat <- analysis(split)
226
#'   tibble(
227
#'     term = "corr",
228
#'     estimate = cor(dat$sqft, dat$price, method = "spearman"),
229
#'     # don't know the analytical std.err so no t-intervals
230
#'     std.err = NA_real_
231
#'   )
232
#' }
233
#'
234
#' set.seed(69325)
235
#' data(Sacramento, package = "modeldata")
236
#' bootstraps(Sacramento, 1000, apparent = TRUE) %>%
237
#'   mutate(correlations = map(splits, rank_corr)) %>%
238
#'   int_pctl(correlations)
239
#' }
240
#' @export
241
int_pctl <- function(.data, statistics, alpha = 0.05) {
242

243 1
  check_rset(.data, app = FALSE)
244 1
  if (length(alpha) != 1 || !is.numeric(alpha)) {
245 1
    abort("`alpha` must be a single numeric value.")
246
  }
247

248 1
  .data <- .data %>% dplyr::filter(id != "Apparent")
249

250 1
  column_name <- tidyselect::vars_select(names(.data), !!rlang::enquo(statistics))
251 1
  if (length(column_name) != 1) {
252 0
    stop(stat_fmt_err, call. = FALSE)
253
  }
254 1
  stats <- .data[[column_name]]
255 1
  stats <- check_tidy(stats, std_col = FALSE)
256

257 1
  check_num_resamples(stats, B = 1000)
258

259 1
  vals <- stats %>%
260 1
    dplyr::group_by(term) %>%
261 1
    dplyr::do(pctl_single(.$estimate, alpha = alpha)) %>%
262 1
    dplyr::ungroup()
263 1
  vals
264

265
}
266

267
# ------------------------------------------------------------------------------
268
# t interval code
269

270
t_single <- function(stats, std_err, is_orig, alpha = 0.05) {
271
  # stats is a numeric vector of values
272
  # vars is a numeric vector of variances
273
  # return a tibble with .lower, .estimate, .upper
274
  # which_orig is the index of stats and std_err that has the original result
275

276 1
  if (all(is.na(stats)))
277 1
    stop("All statistics have missing values.", call. = FALSE)
278

279 1
  if (!is.logical(is_orig) || any(is.na(is_orig))) {
280 0
    stop("`is_orig` should be a logical column the same length as `stats` ",
281 0
         "with no missing values.", call. = FALSE)
282
  }
283 1
  if (length(stats) != length(std_err) && length(stats) != length(is_orig)) {
284 0
    stop("`stats`, `std_err`, and `is_orig` should have the same length.",
285 0
         call. = FALSE)
286
  }
287 1
  if (sum(is_orig) != 1) {
288 0
    stop("The original statistic must be in a single row.", call. = FALSE)
289
  }
290

291 1
  theta_obs   <- stats[is_orig]
292 1
  std_err_obs <- std_err[is_orig]
293

294 1
  stats   <- stats[!is_orig]
295 1
  std_err <- std_err[!is_orig]
296

297 1
  z_dist <-
298 1
    (stats - theta_obs) / std_err
299

300 1
  z_pntl <-
301 1
    quantile(z_dist, probs = c(alpha / 2, 1 - (alpha) / 2), na.rm = TRUE)
302

303 1
  ci <- theta_obs - z_pntl * std_err_obs
304

305 1
  tibble(
306 1
    .lower = min(ci),
307 1
    .estimate = mean(stats, na.rm = TRUE),
308 1
    .upper = max(ci),
309 1
    .alpha = alpha,
310 1
    .method = "student-t"
311
  )
312
}
313

314

315
#' @rdname int_pctl
316
#' @export
317
int_t <- function(.data, statistics, alpha = 0.05) {
318

319 1
  check_rset(.data)
320 1
  if (length(alpha) != 1 || !is.numeric(alpha)) {
321 1
    abort("`alpha` must be a single numeric value.")
322
  }
323

324 1
  column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics))
325 1
  if (length(column_name) != 1) {
326 0
    stop(stat_fmt_err, call. = FALSE)
327
  }
328 1
  stats <- .data %>% dplyr::select(!!column_name, id)
329 1
  stats <- check_tidy(stats, std_col = TRUE)
330

331 1
  check_num_resamples(stats, B = 500)
332

333 1
  vals <-
334 1
    stats %>%
335 1
    dplyr::group_by(term) %>%
336 1
    dplyr::do(t_single(.$estimate, .$std_err, .$orig, alpha = alpha)) %>%
337 1
    dplyr::ungroup()
338 1
  vals
339
}
340

341

342
# ----------------------------------------------------------------
343

344
bca_calc <- function(stats, orig_data, alpha = 0.05, .fn, ...) {
345

346
  # TODO check per term
347 1
  if (all(is.na(stats$estimate))) {
348 1
    stop("All statistics have missing values.", call. = FALSE)
349
  }
350

351
  ### Estimating Z0 bias-correction
352 1
  bias_corr_stats <- get_p0(stats, alpha = alpha)
353

354
  # need the original data frame here
355 1
  loo_rs <- loo_cv(orig_data)
356

357
  # We can't be sure what we will get back from the analysis function.
358
  # To test, we run on the first LOO data set and see if it is a vector or df
359 1
  loo_test <- try(rlang::exec(.fn, loo_rs$splits[[1]], ...), silent = TRUE)
360 1
  if (inherits(loo_test, "try-error")) {
361 0
    cat("Running `.fn` on the LOO resamples produced an error:\n")
362 0
    print(loo_test)
363 0
    stop("`.fn` failed.", call. = FALSE)
364
  }
365

366 1
  loo_res <- furrr::future_map_dfr(loo_rs$splits, .fn, ...)
367

368 1
  loo_estimate <-
369 1
    loo_res %>%
370 1
    dplyr::group_by(term) %>%
371 1
    dplyr::summarize(loo = mean(estimate, na.rm = TRUE)) %>%
372 1
    dplyr::inner_join(loo_res, by = "term")  %>%
373 1
    dplyr::group_by(term) %>%
374 1
    dplyr::summarize(
375 1
      cubed = sum((loo - estimate)^3),
376 1
      squared = sum((loo - estimate)^2)
377
    ) %>%
378 1
    dplyr::ungroup() %>%
379 1
    dplyr::inner_join(bias_corr_stats, by = "term") %>%
380 1
    dplyr::mutate(
381 1
      a = cubed/(6 * (squared^(3 / 2))),
382 1
      Zu = (Z0 + Za) / ( 1 - a * (Z0 + Za)) + Z0,
383 1
      Zl = (Z0 - Za) / (1 - a * (Z0 - Za)) + Z0,
384 1
      lo = stats::pnorm(Zl, lower.tail = TRUE),
385 1
      hi = stats::pnorm(Zu, lower.tail = TRUE)
386
    )
387

388 1
  terms <- loo_estimate$term
389 1
  stats <- stats %>% dplyr::filter(!orig)
390 1
  for (i in seq_along(terms)) {
391 1
    tmp <- new_stats(stats$estimate[ stats$term == terms[i] ],
392 1
                     lo = loo_estimate$lo[i],
393 1
                     hi = loo_estimate$hi[i])
394 1
    tmp$term <- terms[i]
395 1
    if (i == 1) {
396 1
      ci_bca <- tmp
397
    } else {
398 1
      ci_bca <- bind_rows(ci_bca, tmp)
399
    }
400
  }
401 1
  ci_bca <-
402 1
    ci_bca %>%
403 1
    dplyr::select(term, .lower, .estimate, .upper) %>%
404 1
    dplyr::mutate(
405 1
      .alpha = alpha,
406 1
      .method = "BCa"
407
    )
408
}
409

410

411
#' @rdname int_pctl
412
#' @param .fn A function to calculate statistic of interest. The
413
#'  function should take an `rsplit` as the first argument and the `...` are
414
#'  required.
415
#' @param ... Arguments to pass to `.fn`.
416
#' @references \url{https://rsample.tidymodels.org/articles/Applications/Intervals.html}
417
#' @export
418
int_bca <- function(.data, statistics, alpha = 0.05, .fn, ...) {
419

420 1
  check_rset(.data)
421 1
  if (length(alpha) != 1 || !is.numeric(alpha)) {
422 1
    abort("`alpha` must be a single numeric value.")
423
  }
424

425 1
  has_dots(.fn)
426

427 1
  column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics))
428 1
  if (length(column_name) != 1) {
429 0
    stop(stat_fmt_err, call. = FALSE)
430
  }
431 1
  stats <- .data %>% dplyr::select(!!column_name, id)
432 1
  stats <- check_tidy(stats)
433

434 1
  check_num_resamples(stats, B = 1000)
435

436 1
  vals <- bca_calc(stats, .data$splits[[1]]$data, alpha = alpha, .fn = .fn, ...)
437 1
  vals
438
}

Read our documentation on viewing source code .

Loading