1
#' Nested or Double Resampling
2
#'
3
#' `nested_cv` can be used to take the results of one resampling procedure
4
#'   and conduct further resamples within each split. Any type of resampling
5
#'   used in `rsample` can be used.
6
#'
7
#' @details
8
#' It is a bad idea to use bootstrapping as the outer resampling procedure (see
9
#'   the example below)
10
#' @param data  A data frame.
11
#' @param outside The initial resampling specification. This can be an already
12
#'   created object or an expression of a new object (see the examples below).
13
#'   If the latter is used, the `data` argument does not need to be
14
#'   specified and, if it is given, will be ignored.
15
#' @param inside An expression for the type of resampling to be conducted
16
#'   within the initial procedure.
17
#' @return  An tibble with `nested_cv` class and any other classes that
18
#'   outer resampling process normally contains. The results include a
19
#'  column for the outer data split objects, one or more `id` columns,
20
#'  and a column of nested tibbles called `inner_resamples` with the
21
#'  additional resamples.
22
#' @examples
23
#' ## Using expressions for the resampling procedures:
24
#' nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5))
25
#'
26
#' ## Using an existing object:
27
#' folds <- vfold_cv(mtcars)
28
#' nested_cv(mtcars, folds, inside = bootstraps(times = 5))
29
#'
30
#' ## The dangers of outer bootstraps:
31
#' set.seed(2222)
32
#' bad_idea <- nested_cv(mtcars,
33
#'                       outside = bootstraps(times = 5),
34
#'                       inside = vfold_cv(v = 3))
35
#'
36
#' first_outer_split <- bad_idea$splits[[1]]
37
#' outer_analysis <- as.data.frame(first_outer_split)
38
#' sum(grepl("Volvo 142E", rownames(outer_analysis)))
39
#'
40
#' ## For the 3-fold CV used inside of each bootstrap, how are the replicated
41
#' ## `Volvo 142E` data partitioned?
42
#' first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]]
43
#' inner_analysis <- as.data.frame(first_inner_split)
44
#' inner_assess   <- as.data.frame(first_inner_split, data = "assessment")
45
#'
46
#' sum(grepl("Volvo 142E", rownames(inner_analysis)))
47
#' sum(grepl("Volvo 142E", rownames(inner_assess)))
48
#' @export
49
nested_cv <- function(data, outside, inside)  {
50 1
  nest_args <- formalArgs(nested_cv)
51 1
  cl <- match.call()
52

53 1
  boot_msg <-
54 1
    paste0(
55 1
      "Using bootstrapping as the outer resample is dangerous ",
56 1
      "since the inner resample might have the same data ",
57 1
      "point in both the analysis and assessment set."
58
    )
59

60 1
  outer_cl <- cl[["outside"]]
61 1
  if (is_call(outer_cl)) {
62 1
    if (grepl("^bootstraps", deparse(outer_cl)))
63 1
      warning(boot_msg, call. = FALSE)
64 1
    outer_cl$data <- quote(data)
65 1
    outside <- eval(outer_cl)
66
  } else {
67 0
    if (inherits(outside, "bootstraps"))
68 0
      warning(boot_msg, call. = FALSE)
69
  }
70

71 1
  inner_cl <- cl[["inside"]]
72 1
  if (!is_call(inner_cl))
73 1
    stop(
74 1
      "`inside` should be a expression such as `vfold()` or ",
75 1
      "bootstraps(times = 10)` instead of a existing object.",
76 1
      call. = FALSE
77
    )
78 1
  inside <- map(outside$splits, inside_resample, cl = inner_cl)
79

80 1
  out <- dplyr::mutate(outside, inner_resamples = inside)
81

82 1
  out <- add_class(out, cls = "nested_cv", at_end = FALSE)
83

84 1
  attr(out, "outside") <- cl$outside
85 1
  attr(out, "inside") <- cl$inside
86

87 1
  out
88
}
89

90
inside_resample <- function(src, cl) {
91 1
  cl$data <- quote(as.data.frame(src))
92 1
  eval(cl)
93
}
94

95
#' @export
96
print.nested_cv <- function(x, ...) {
97 1
  char_x <- paste("#", pretty(x))
98 1
  cat(char_x, sep = "\n")
99 1
  class(x) <- class(tibble())
100 1
  print(x, ...)
101
}

Read our documentation on viewing source code .

Loading