1
#' Bootstrap Sampling
2
#'
3
#' A bootstrap sample is a sample that is the same size as the original data
4
#'  set that is made using replacement. This results in analysis samples that
5
#'  have multiple replicates of some of the original rows of the data. The
6
#'  assessment set is defined as the rows of the original data that were not
7
#'  included in the bootstrap sample. This is often referred to as the
8
#'  "out-of-bag" (OOB) sample.
9
#' @details The argument `apparent` enables the option of an additional
10
#'  "resample" where the analysis and assessment data sets are the same as the
11
#'  original data set. This can be required for some types of analysis of the
12
#'  bootstrap results.
13
#' The `strata` argument is based on a similar argument in the random forest
14
#'  package were the bootstrap samples are conducted *within the stratification
15
#'  variable*. This can help ensure that the number of data points in the
16
#'  bootstrap sample is equivalent to the proportions in the original data set.
17
#'  (Strata below 10% of the total are pooled together.)
18
#' @inheritParams vfold_cv
19
#' @param times The number of bootstrap samples.
20
#' @param strata A variable that is used to conduct stratified sampling. When
21
#'  not `NULL`, each bootstrap sample is created within the stratification
22
#'  variable. This could be a single character value or a variable name that
23
#'  corresponds to a variable that exists in the data frame.
24
#' @param breaks A single number giving the number of bins desired to stratify
25
#'  a numeric stratification variable.
26
#' @param apparent A logical. Should an extra resample be added where the
27
#'  analysis and holdout subset are the entire data set. This is required for
28
#'  some estimators used by the `summary` function that require the apparent
29
#'  error rate.
30
#' @export
31
#' @return An tibble with classes `bootstraps`, `rset`, `tbl_df`, `tbl`, and
32
#'  `data.frame`. The results include a column for the data split objects and a
33
#'  column called `id` that has a character string with the resample identifier.
34
#' @examples
35
#' bootstraps(mtcars, times = 2)
36
#' bootstraps(mtcars, times = 2, apparent = TRUE)
37
#'
38
#' library(purrr)
39
#' data(wa_churn, package = "modeldata")
40
#'
41
#' set.seed(13)
42
#' resample1 <- bootstraps(wa_churn, times = 3)
43
#' map_dbl(resample1$splits,
44
#'         function(x) {
45
#'           dat <- as.data.frame(x)$churn
46
#'           mean(dat == "Yes")
47
#'         })
48
#'
49
#' set.seed(13)
50
#' resample2 <- bootstraps(wa_churn, strata = "churn", times = 3)
51
#' map_dbl(resample2$splits,
52
#'         function(x) {
53
#'           dat <- as.data.frame(x)$churn
54
#'           mean(dat == "Yes")
55
#'         })
56
#'
57
#' set.seed(13)
58
#' resample3 <- bootstraps(wa_churn, strata = "tenure", breaks = 6, times = 3)
59
#' map_dbl(resample3$splits,
60
#'         function(x) {
61
#'           dat <- as.data.frame(x)$churn
62
#'           mean(dat == "Yes")
63
#'         })
64
#' @export
65
bootstraps <-
66
  function(data,
67
           times = 25,
68
           strata = NULL,
69
           breaks = 4,
70
           apparent = FALSE,
71
           ...) {
72

73 1
  if(!missing(strata)) {
74 1
    strata <- tidyselect::vars_select(names(data), !!enquo(strata))
75 0
    if(length(strata) == 0) strata <- NULL
76
  }
77

78 1
  strata_check(strata, names(data))
79

80 1
  split_objs <-
81 1
    boot_splits(
82 1
      data = data,
83 1
      times = times,
84 1
      strata = strata,
85 1
      breaks = breaks
86
    )
87 1
  if(apparent)
88 1
    split_objs <- bind_rows(split_objs, apparent(data))
89

90 1
  boot_att <- list(times = times,
91 1
                   apparent = apparent,
92 1
                   strata = !is.null(strata))
93

94 1
  new_rset(splits = split_objs$splits,
95 1
           ids = split_objs$id,
96 1
           attrib = boot_att,
97 1
           subclass = c("bootstraps", "rset"))
98

99
}
100

101
# Get the indices of the analysis set from the analysis set (= bootstrap sample)
102
boot_complement <- function(ind, n) {
103 1
  list(analysis = ind, assessment = NA)
104
}
105

106
boot_splits <-
107
  function(data,
108
           times = 25,
109
           strata = NULL,
110
           breaks = 4) {
111

112 1
  n <- nrow(data)
113

114 1
  if (is.null(strata)) {
115 1
    indices <- purrr::map(rep(n, times), sample, replace = TRUE)
116
  } else {
117 1
    stratas <- tibble::tibble(idx = 1:n,
118 1
                              strata = make_strata(getElement(data, strata),
119 1
                                                   breaks = breaks))
120 1
    stratas <- split_unnamed(stratas, stratas$strata)
121 1
    stratas <-
122 1
      purrr::map_df(
123 1
        stratas,
124 1
        strat_sample,
125 1
        prop = 1,
126 1
        times = times,
127 1
        replace = TRUE
128
      )
129 1
    indices <- split_unnamed(stratas$idx, stratas$rs_id)
130
  }
131

132 1
  indices <- lapply(indices, boot_complement, n = n)
133

134 1
  split_objs <-
135 1
    purrr::map(indices, make_splits, data = data, class = "boot_split")
136 1
  list(splits = split_objs,
137 1
       id = names0(length(split_objs), "Bootstrap"))
138
}
139

140
#' @export
141
print.bootstraps <- function(x, ...) {
142 1
  cat("#", pretty(x), "\n")
143 1
  class(x) <- class(x)[!(class(x) %in% c("bootstraps", "rset"))]
144 1
  print(x, ...)
145
}

Read our documentation on viewing source code .

Loading