generalize and export some internal functions for splitting
1 |
#' Tidy Resampling Object
|
|
2 |
#'
|
|
3 |
#' The `tidy` function from the \pkg{broom} package can be used on `rset` and
|
|
4 |
#' `rsplit` objects to generate tibbles with which rows are in the analysis and
|
|
5 |
#' assessment sets.
|
|
6 |
#' @param x A `rset` or `rsplit` object
|
|
7 |
#' @param unique_ind Should unique row identifiers be returned? For example,
|
|
8 |
#' if `FALSE` then bootstrapping results will include multiple rows in the
|
|
9 |
#' sample for the same row in the original data.
|
|
10 |
#' @param ... Not currently used.
|
|
11 |
#' @return A tibble with columns `Row` and `Data`. The latter has possible
|
|
12 |
#' values "Analysis" or "Assessment". For `rset` inputs, identification columns
|
|
13 |
#' are also returned but their names and values depend on the type of
|
|
14 |
#' resampling. `vfold_cv` contains a column "Fold" and, if repeats are used,
|
|
15 |
#' another called "Repeats". `bootstraps` and `mc_cv` use the column
|
|
16 |
#' "Resample".
|
|
17 |
#' @details Note that for nested resampling, the rows of the inner resample,
|
|
18 |
#' named `inner_Row`, are *relative* row indices and do not correspond to the
|
|
19 |
#' rows in the original data set.
|
|
20 |
#' @examples
|
|
21 |
#' library(ggplot2)
|
|
22 |
#' theme_set(theme_bw())
|
|
23 |
#'
|
|
24 |
#' set.seed(4121)
|
|
25 |
#' cv <- tidy(vfold_cv(mtcars, v = 5))
|
|
26 |
#' ggplot(cv, aes(x = Fold, y = Row, fill = Data)) +
|
|
27 |
#' geom_tile() + scale_fill_brewer()
|
|
28 |
#'
|
|
29 |
#' set.seed(4121)
|
|
30 |
#' rcv <- tidy(vfold_cv(mtcars, v = 5, repeats = 2))
|
|
31 |
#' ggplot(rcv, aes(x = Fold, y = Row, fill = Data)) +
|
|
32 |
#' geom_tile() + facet_wrap(~Repeat) + scale_fill_brewer()
|
|
33 |
#'
|
|
34 |
#' set.seed(4121)
|
|
35 |
#' mccv <- tidy(mc_cv(mtcars, times = 5))
|
|
36 |
#' ggplot(mccv, aes(x = Resample, y = Row, fill = Data)) +
|
|
37 |
#' geom_tile() + scale_fill_brewer()
|
|
38 |
#'
|
|
39 |
#' set.seed(4121)
|
|
40 |
#' bt <- tidy(bootstraps(mtcars, time = 5))
|
|
41 |
#' ggplot(bt, aes(x = Resample, y = Row, fill = Data)) +
|
|
42 |
#' geom_tile() + scale_fill_brewer()
|
|
43 |
#'
|
|
44 |
#' dat <- data.frame(day = 1:30)
|
|
45 |
#' # Resample by week instead of day
|
|
46 |
#' ts_cv <- rolling_origin(dat, initial = 7, assess = 7,
|
|
47 |
#' skip = 6, cumulative = FALSE)
|
|
48 |
#' ts_cv <- tidy(ts_cv)
|
|
49 |
#' ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) +
|
|
50 |
#' geom_tile() + scale_fill_brewer()
|
|
51 |
#' @export
|
|
52 |
tidy.rsplit <- function(x, unique_ind = TRUE, ...) { |
|
53 | 1 |
if (unique_ind) x$in_id <- unique(x$in_id) |
54 | 1 |
out <- tibble(Row = c(x$in_id, complement(x)), |
55 | 1 |
Data = rep(c("Analysis", "Assessment"), |
56 | 1 |
c(length(x$in_id), length(complement(x))))) |
57 | 1 |
out <- dplyr::arrange(.data = out, Data, Row) |
58 | 1 |
out |
59 |
}
|
|
60 |
|
|
61 |
#' @rdname tidy.rsplit
|
|
62 |
#' @export
|
|
63 |
tidy.rset <- function(x, ...) { |
|
64 | 1 |
dots <- list(...) |
65 | 1 |
unique_ind <- dplyr::if_else(is.null(dots$unique_ind), |
66 | 1 |
TRUE, |
67 | 1 |
dots$unique_ind) |
68 | 1 |
stacked <- purrr::map(x$splits, tidy, unique_ind = unique_ind) |
69 | 1 |
for (i in seq(along = stacked)) |
70 | 1 |
stacked[[i]]$Resample <- x$id[i] |
71 | 1 |
stacked <- dplyr::bind_rows(stacked) |
72 | 1 |
stacked <- dplyr::arrange(.data = stacked, Data, Row) |
73 | 1 |
stacked |
74 |
}
|
|
75 |
#' @rdname tidy.rsplit
|
|
76 |
#' @export
|
|
77 |
tidy.vfold_cv <- function(x, ...) { |
|
78 | 1 |
stacked <- purrr::map(x$splits, tidy) |
79 | 1 |
for (i in seq(along = stacked)) { |
80 | 1 |
if (attr(x, "repeats") > 1) { |
81 | 1 |
stacked[[i]]$Repeat <- x$id[i] |
82 | 1 |
stacked[[i]]$Fold <- x$id2[i] |
83 |
} else |
|
84 | 1 |
stacked[[i]]$Fold <- x$id[i] |
85 |
}
|
|
86 | 1 |
stacked <- dplyr::bind_rows(stacked) |
87 | 1 |
stacked <- dplyr::arrange(.data = stacked, Data, Row) |
88 | 1 |
stacked |
89 |
}
|
|
90 |
|
|
91 |
#' @rdname tidy.rsplit
|
|
92 |
#' @export
|
|
93 |
tidy.nested_cv <- function(x, ...) { |
|
94 |
|
|
95 |
x$inner_tidy <- purrr::map(x$inner_resamples, tidy_wrap) |
|
96 |
inner_tidy <- tidyr::unnest(x, inner_tidy) |
|
97 |
class(x) <- class(x)[class(x) != "nested_cv"] |
|
98 |
outer_tidy <- tidy(x) |
|
99 |
id_cols <- names(outer_tidy) |
|
100 |
id_cols <- id_cols[!(id_cols %in% c("Row", "Data"))] |
|
101 |
|
|
102 |
inner_id <- grep("^id", names(inner_tidy)) |
|
103 |
if (length(inner_id) != length(id_cols)) |
|
104 |
stop("Cannot merge tidt data sets", call. = FALSE) |
|
105 |
names(inner_tidy)[inner_id] <- id_cols |
|
106 |
full_join(outer_tidy, inner_tidy, by = id_cols) |
|
107 |
}
|
|
108 |
|
|
109 |
tidy_wrap <- function(x) { |
|
110 |
x <- tidy(x) |
|
111 |
names(x) <- paste0("inner_", names(x)) |
|
112 |
x |
|
113 |
}
|
Read our documentation on viewing source code .