Help documentation updates
Update slide.R help doc
1 |
#' Rolling Origin Forecast Resampling
|
|
2 |
#'
|
|
3 |
#' This resampling method is useful when the data set has a strong time
|
|
4 |
#' component. The resamples are not random and contain data points that are
|
|
5 |
#' consecutive values. The function assumes that the original data set are
|
|
6 |
#' sorted in time order.
|
|
7 |
#' @details The main options, `initial` and `assess`, control the number of
|
|
8 |
#' data points from the original data that are in the analysis and assessment
|
|
9 |
#' set, respectively. When `cumulative = TRUE`, the analysis set will grow as
|
|
10 |
#' resampling continues while the assessment set size will always remain
|
|
11 |
#' static.
|
|
12 |
#' `skip` enables the function to not use every data point in the resamples.
|
|
13 |
#' When `skip = 0`, the resampling data sets will increment by one position.
|
|
14 |
#' Suppose that the rows of a data set are consecutive days. Using `skip = 6`
|
|
15 |
#' will make the analysis data set to operate on *weeks* instead of days. The
|
|
16 |
#' assessment set size is not affected by this option.
|
|
17 |
#' @seealso
|
|
18 |
#' [sliding_window()], [sliding_index()], and [sliding_period()] for additional
|
|
19 |
#' time based resampling functions.
|
|
20 |
#' @inheritParams vfold_cv
|
|
21 |
#' @param initial The number of samples used for analysis/modeling in the
|
|
22 |
#' initial resample.
|
|
23 |
#' @param assess The number of samples used for each assessment resample.
|
|
24 |
#' @param cumulative A logical. Should the analysis resample grow beyond the
|
|
25 |
#' size specified by `initial` at each resample?.
|
|
26 |
#' @param skip A integer indicating how many (if any) _additional_ resamples
|
|
27 |
#' to skip to thin the total amount of data points in the analysis resample.
|
|
28 |
#' See the example below.
|
|
29 |
#' @param lag A value to include a lag between the assessment
|
|
30 |
#' and analysis set. This is useful if lagged predictors will be used
|
|
31 |
#' during training and testing.
|
|
32 |
#' @export
|
|
33 |
#' @return An tibble with classes `rolling_origin`, `rset`, `tbl_df`, `tbl`,
|
|
34 |
#' and `data.frame`. The results include a column for the data split objects
|
|
35 |
#' and a column called `id` that has a character string with the resample
|
|
36 |
#' identifier.
|
|
37 |
#' @examples
|
|
38 |
#' set.seed(1131)
|
|
39 |
#' ex_data <- data.frame(row = 1:20, some_var = rnorm(20))
|
|
40 |
#' dim(rolling_origin(ex_data))
|
|
41 |
#' dim(rolling_origin(ex_data, skip = 2))
|
|
42 |
#' dim(rolling_origin(ex_data, skip = 2, cumulative = FALSE))
|
|
43 |
#'
|
|
44 |
#' # You can also roll over calendar periods by first nesting by that period,
|
|
45 |
#' # which is especially useful for irregular series where a fixed window
|
|
46 |
#' # is not useful. This example slides over 5 years at a time.
|
|
47 |
#' library(dplyr)
|
|
48 |
#' library(tidyr)
|
|
49 |
#' data(drinks, package = "modeldata")
|
|
50 |
#'
|
|
51 |
#' drinks_annual <- drinks %>%
|
|
52 |
#' mutate(year = as.POSIXlt(date)$year + 1900) %>%
|
|
53 |
#' nest(-year)
|
|
54 |
#'
|
|
55 |
#' multi_year_roll <- rolling_origin(drinks_annual, cumulative = FALSE)
|
|
56 |
#'
|
|
57 |
#' analysis(multi_year_roll$splits[[1]])
|
|
58 |
#' assessment(multi_year_roll$splits[[1]])
|
|
59 |
#'
|
|
60 |
#' @export
|
|
61 |
rolling_origin <- function(data, initial = 5, assess = 1, |
|
62 |
cumulative = TRUE, skip = 0, lag = 0, ...) { |
|
63 | 1 |
n <- nrow(data) |
64 |
|
|
65 | 1 |
if (n < initial + assess) |
66 |
stop("There should be at least ", |
|
67 |
initial + assess, |
|
68 |
" nrows in `data`", |
|
69 |
call. = FALSE) |
|
70 |
|
|
71 | 1 |
if (!is.numeric(lag) | !(lag%%1==0)) { |
72 |
stop("`lag` must be a whole number.", call. = FALSE) |
|
73 |
}
|
|
74 |
|
|
75 | 1 |
if (lag > initial) { |
76 |
stop("`lag` must be less than or equal to the number of training observations.", call. = FALSE) |
|
77 |
}
|
|
78 |
|
|
79 | 1 |
stops <- seq(initial, (n - assess), by = skip + 1) |
80 | 1 |
starts <- if (!cumulative) { |
81 | 1 |
stops - initial + 1 |
82 |
} else { |
|
83 | 1 |
starts <- rep(1, length(stops)) |
84 |
}
|
|
85 |
|
|
86 | 1 |
in_ind <- mapply(seq, starts, stops, SIMPLIFY = FALSE) |
87 | 1 |
out_ind <-
|
88 | 1 |
mapply(seq, stops + 1 - lag, stops + assess, SIMPLIFY = FALSE) |
89 | 1 |
indices <- mapply(merge_lists, in_ind, out_ind, SIMPLIFY = FALSE) |
90 | 1 |
split_objs <-
|
91 | 1 |
purrr::map(indices, make_splits, data = data, class = "rof_split") |
92 | 1 |
split_objs <- list(splits = split_objs, |
93 | 1 |
id = names0(length(split_objs), "Slice")) |
94 |
|
|
95 | 1 |
roll_att <- list(initial = initial, |
96 | 1 |
assess = assess, |
97 | 1 |
cumulative = cumulative, |
98 | 1 |
skip = skip, |
99 | 1 |
lag = lag) |
100 |
|
|
101 | 1 |
new_rset(splits = split_objs$splits, |
102 | 1 |
ids = split_objs$id, |
103 | 1 |
attrib = roll_att, |
104 | 1 |
subclass = c("rolling_origin", "rset")) |
105 |
}
|
|
106 |
|
|
107 |
#' @export
|
|
108 |
print.rolling_origin <- function(x, ...) { |
|
109 |
cat("#", pretty(x), "\n") |
|
110 |
class(x) <- class(x)[!(class(x) %in% c("rolling_origin", "rset"))] |
|
111 |
print(x, ...) |
|
112 |
}
|
Read our documentation on viewing source code .