1
#' Monte Carlo Cross-Validation
2
#'
3
#' One resample of Monte Carlo cross-validation takes a random sample (without
4
#'  replacement) of the original data set to be used for analysis. All other
5
#'  data points are added to the assessment set.
6
#' @details The `strata` argument causes the random sampling to be conducted
7
#'  *within the stratification variable*. This can help ensure that the number of
8
#'  data points in the analysis data is equivalent to the proportions in the
9
#'  original data set. (Strata below 10% of the total are pooled together.)
10
#' @inheritParams vfold_cv
11
#' @param prop The proportion of data to be retained for modeling/analysis.
12
#' @param times The number of times to repeat the sampling.
13
#' @param strata A variable that is used to conduct stratified sampling to
14
#'  create the resamples. This could be a single character value or a variable
15
#'  name that corresponds to a variable that exists in the data frame.
16
#' @param breaks A single number giving the number of bins desired to stratify
17
#'  a numeric stratification variable.
18
#' @export
19
#' @return An tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and
20
#'  `data.frame`. The results include a column for the data split objects and a
21
#'  column called `id` that has a character string with the resample identifier.
22
#' @examples
23
#' mc_cv(mtcars, times = 2)
24
#' mc_cv(mtcars, prop = .5, times = 2)
25
#'
26
#' library(purrr)
27
#' data(wa_churn, package = "modeldata")
28
#'
29
#' set.seed(13)
30
#' resample1 <- mc_cv(wa_churn, times = 3, prop = .5)
31
#' map_dbl(resample1$splits,
32
#'         function(x) {
33
#'           dat <- as.data.frame(x)$churn
34
#'           mean(dat == "Yes")
35
#'         })
36
#'
37
#' set.seed(13)
38
#' resample2 <- mc_cv(wa_churn, strata = "churn", times = 3, prop = .5)
39
#' map_dbl(resample2$splits,
40
#'         function(x) {
41
#'           dat <- as.data.frame(x)$churn
42
#'           mean(dat == "Yes")
43
#'         })
44
#'
45
#' set.seed(13)
46
#' resample3 <- mc_cv(wa_churn, strata = "tenure", breaks = 6, times = 3, prop = .5)
47
#' map_dbl(resample3$splits,
48
#'         function(x) {
49
#'           dat <- as.data.frame(x)$churn
50
#'           mean(dat == "Yes")
51
#'         })
52
#' @export
53
mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) {
54

55 1
  if(!missing(strata)) {
56 1
    strata <- tidyselect::vars_select(names(data), !!enquo(strata))
57 1
    if(length(strata) == 0) strata <- NULL
58
  }
59

60 1
  strata_check(strata, names(data))
61

62 1
  split_objs <-
63 1
    mc_splits(data = data,
64 1
              prop = 1 - prop,
65 1
              times = times,
66 1
              strata = strata,
67 1
              breaks = breaks)
68

69
  ## We remove the holdout indices since it will save space and we can
70
  ## derive them later when they are needed.
71

72 1
  split_objs$splits <- map(split_objs$splits, rm_out)
73

74 1
  mc_att <- list(prop = prop,
75 1
                 times = times,
76 1
                 strata = !is.null(strata))
77

78 1
  new_rset(splits = split_objs$splits,
79 1
           ids = split_objs$id,
80 1
           attrib = mc_att,
81 1
           subclass = c("mc_cv", "rset"))
82
}
83

84
# Get the indices of the analysis set from the assessment set
85
mc_complement <- function(ind, n) {
86 1
  list(analysis = setdiff(1:n, ind),
87 1
       assessment = ind)
88
}
89

90

91
mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) {
92 1
  if (!is.numeric(prop) | prop >= 1 | prop <= 0)
93 0
    stop("`prop` must be a number on (0, 1).", call. = FALSE)
94

95 1
  n <- nrow(data)
96 1
  if (is.null(strata)) {
97 1
    indices <- purrr::map(rep(n, times), sample, size = floor(n * prop))
98
  } else {
99 1
    stratas <- tibble::tibble(idx = 1:n,
100 1
                              strata = make_strata(getElement(data, strata),
101 1
                                                   breaks = breaks))
102 1
    stratas <- split_unnamed(stratas, stratas$strata)
103 1
    stratas <-
104 1
      purrr::map_df(stratas, strat_sample, prop = prop, times = times)
105 1
    indices <- split_unnamed(stratas$idx, stratas$rs_id)
106
  }
107 1
  indices <- lapply(indices, mc_complement, n = n)
108 1
  split_objs <-
109 1
    purrr::map(indices, make_splits, data = data, class = "mc_split")
110 1
  list(splits = split_objs,
111 1
       id = names0(length(split_objs), "Resample"))
112
}
113

114
strat_sample <- function(x, prop, times, ...) {
115 1
  n <- nrow(x)
116 1
  idx <- purrr::map(rep(n, times), sample, size = floor(n*prop), ...)
117 1
  out <- purrr::map_df(idx, function(ind, x) x[sort(ind), "idx"], x = x)
118 1
  out$rs_id <- rep(1:times, each = floor(n*prop))
119 1
  out
120
}
121

122
#' @export
123
print.mc_cv <- function(x, ...) {
124 1
  cat("#", pretty(x), "\n")
125 1
  class(x) <- class(x)[!(class(x) %in% c("mc_cv", "rset"))]
126 1
  print(x, ...)
127
}

Read our documentation on viewing source code .

Loading