Help documentation updates
Update slide.R help doc
1 |
#' Monte Carlo Cross-Validation
|
|
2 |
#'
|
|
3 |
#' One resample of Monte Carlo cross-validation takes a random sample (without
|
|
4 |
#' replacement) of the original data set to be used for analysis. All other
|
|
5 |
#' data points are added to the assessment set.
|
|
6 |
#' @details The `strata` argument causes the random sampling to be conducted
|
|
7 |
#' *within the stratification variable*. This can help ensure that the number of
|
|
8 |
#' data points in the analysis data is equivalent to the proportions in the
|
|
9 |
#' original data set. (Strata below 10% of the total are pooled together.)
|
|
10 |
#' @inheritParams vfold_cv
|
|
11 |
#' @param prop The proportion of data to be retained for modeling/analysis.
|
|
12 |
#' @param times The number of times to repeat the sampling.
|
|
13 |
#' @param strata A variable that is used to conduct stratified sampling to
|
|
14 |
#' create the resamples. This could be a single character value or a variable
|
|
15 |
#' name that corresponds to a variable that exists in the data frame.
|
|
16 |
#' @param breaks A single number giving the number of bins desired to stratify
|
|
17 |
#' a numeric stratification variable.
|
|
18 |
#' @export
|
|
19 |
#' @return An tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and
|
|
20 |
#' `data.frame`. The results include a column for the data split objects and a
|
|
21 |
#' column called `id` that has a character string with the resample identifier.
|
|
22 |
#' @examples
|
|
23 |
#' mc_cv(mtcars, times = 2)
|
|
24 |
#' mc_cv(mtcars, prop = .5, times = 2)
|
|
25 |
#'
|
|
26 |
#' library(purrr)
|
|
27 |
#' data(wa_churn, package = "modeldata")
|
|
28 |
#'
|
|
29 |
#' set.seed(13)
|
|
30 |
#' resample1 <- mc_cv(wa_churn, times = 3, prop = .5)
|
|
31 |
#' map_dbl(resample1$splits,
|
|
32 |
#' function(x) {
|
|
33 |
#' dat <- as.data.frame(x)$churn
|
|
34 |
#' mean(dat == "Yes")
|
|
35 |
#' })
|
|
36 |
#'
|
|
37 |
#' set.seed(13)
|
|
38 |
#' resample2 <- mc_cv(wa_churn, strata = "churn", times = 3, prop = .5)
|
|
39 |
#' map_dbl(resample2$splits,
|
|
40 |
#' function(x) {
|
|
41 |
#' dat <- as.data.frame(x)$churn
|
|
42 |
#' mean(dat == "Yes")
|
|
43 |
#' })
|
|
44 |
#'
|
|
45 |
#' set.seed(13)
|
|
46 |
#' resample3 <- mc_cv(wa_churn, strata = "tenure", breaks = 6, times = 3, prop = .5)
|
|
47 |
#' map_dbl(resample3$splits,
|
|
48 |
#' function(x) {
|
|
49 |
#' dat <- as.data.frame(x)$churn
|
|
50 |
#' mean(dat == "Yes")
|
|
51 |
#' })
|
|
52 |
#' @export
|
|
53 |
mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) { |
|
54 |
|
|
55 | 1 |
if(!missing(strata)) { |
56 | 1 |
strata <- tidyselect::vars_select(names(data), !!enquo(strata)) |
57 | 1 |
if(length(strata) == 0) strata <- NULL |
58 |
}
|
|
59 |
|
|
60 | 1 |
strata_check(strata, names(data)) |
61 |
|
|
62 | 1 |
split_objs <-
|
63 | 1 |
mc_splits(data = data, |
64 | 1 |
prop = 1 - prop, |
65 | 1 |
times = times, |
66 | 1 |
strata = strata, |
67 | 1 |
breaks = breaks) |
68 |
|
|
69 |
## We remove the holdout indices since it will save space and we can
|
|
70 |
## derive them later when they are needed.
|
|
71 |
|
|
72 | 1 |
split_objs$splits <- map(split_objs$splits, rm_out) |
73 |
|
|
74 | 1 |
mc_att <- list(prop = prop, |
75 | 1 |
times = times, |
76 | 1 |
strata = !is.null(strata)) |
77 |
|
|
78 | 1 |
new_rset(splits = split_objs$splits, |
79 | 1 |
ids = split_objs$id, |
80 | 1 |
attrib = mc_att, |
81 | 1 |
subclass = c("mc_cv", "rset")) |
82 |
}
|
|
83 |
|
|
84 |
# Get the indices of the analysis set from the assessment set
|
|
85 |
mc_complement <- function(ind, n) { |
|
86 | 1 |
list(analysis = setdiff(1:n, ind), |
87 | 1 |
assessment = ind) |
88 |
}
|
|
89 |
|
|
90 |
|
|
91 |
mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) { |
|
92 | 1 |
if (!is.numeric(prop) | prop >= 1 | prop <= 0) |
93 |
stop("`prop` must be a number on (0, 1).", call. = FALSE) |
|
94 |
|
|
95 | 1 |
n <- nrow(data) |
96 | 1 |
if (is.null(strata)) { |
97 | 1 |
indices <- purrr::map(rep(n, times), sample, size = floor(n * prop)) |
98 |
} else { |
|
99 | 1 |
stratas <- tibble::tibble(idx = 1:n, |
100 | 1 |
strata = make_strata(getElement(data, strata), |
101 | 1 |
breaks = breaks)) |
102 | 1 |
stratas <- split_unnamed(stratas, stratas$strata) |
103 | 1 |
stratas <-
|
104 | 1 |
purrr::map_df(stratas, strat_sample, prop = prop, times = times) |
105 | 1 |
indices <- split_unnamed(stratas$idx, stratas$rs_id) |
106 |
}
|
|
107 | 1 |
indices <- lapply(indices, mc_complement, n = n) |
108 | 1 |
split_objs <-
|
109 | 1 |
purrr::map(indices, make_splits, data = data, class = "mc_split") |
110 | 1 |
list(splits = split_objs, |
111 | 1 |
id = names0(length(split_objs), "Resample")) |
112 |
}
|
|
113 |
|
|
114 |
strat_sample <- function(x, prop, times, ...) { |
|
115 | 1 |
n <- nrow(x) |
116 | 1 |
idx <- purrr::map(rep(n, times), sample, size = floor(n*prop), ...) |
117 | 1 |
out <- purrr::map_df(idx, function(ind, x) x[sort(ind), "idx"], x = x) |
118 | 1 |
out$rs_id <- rep(1:times, each = floor(n*prop)) |
119 | 1 |
out |
120 |
}
|
|
121 |
|
|
122 |
#' @export
|
|
123 |
print.mc_cv <- function(x, ...) { |
|
124 | 1 |
cat("#", pretty(x), "\n") |
125 | 1 |
class(x) <- class(x)[!(class(x) %in% c("mc_cv", "rset"))] |
126 | 1 |
print(x, ...) |
127 |
}
|
Read our documentation on viewing source code .