sahirbhatnagar / casebase
1
#' Create case-base dataset for use in fitting parametric hazard functions
2
#'
3
#' This function implements the case-base sampling approach described in Hanley
4
#' and Miettinen (2009). It can be used to fit smooth-in-time parametric
5
#' functions easily via logistic regression.
6
#'
7
#' The base series is sampled using a multinomial scheme: individuals are
8
#' sampled proportionally to their follow-up time.
9
#'
10
#' It is assumed that \code{data} contains the two columns corresponding to the
11
#' supplied time and event variables. If either the \code{time} or \code{event}
12
#' argument is missing, the function looks for columns with appropriate-looking
13
#' names (see \code{\link{checkArgsTimeEvent}}).
14
#'
15
#' @section Warning: The offset is calculated using the total follow-up time for
16
#'   all individuals in the study. Therefore, we need \code{time} to be on the
17
#'   original scale, not a transformed scale (e.g. logarithmic). Otherwise, the
18
#'   offset and the estimation will be wrong.
19
#'
20
#' @param data a data.frame or data.table containing the source dataset.
21
#' @param time a character string giving the name of the time variable. See
22
#'   Details.
23
#' @param event a character string giving the name of the event variable. See
24
#'   Details.
25
#' @param ratio Integer, giving the ratio of the size of the base series to that
26
#'   of the case series. Defaults to 10.
27
#' @param comprisk Logical. Indicates whether we have multiple event types and
28
#'   that we want to consider some of them as competing risks.
29
#' @param censored.indicator a character string of length 1 indicating which
30
#'   value in \code{event} is the censored. This function will use
31
#'   \code{\link[stats]{relevel}} to set \code{censored.indicator} as the
32
#'   reference level. This argument is ignored if the \code{event} variable is a
33
#'   numeric
34
#' @return The function returns a dataset, with the same format as the source
35
#'   dataset, and where each row corresponds to a person-moment sampled from the
36
#'   case or the base series.
37
#' @export
38
#' @examples
39
#' # Simulate censored survival data for two outcome types from exponential
40
#' library(data.table)
41
#' set.seed(12345)
42
#' nobs <- 500
43
#' tlim <- 10
44
#'
45
#' # simulation parameters
46
#' b1 <- 200
47
#' b2 <- 50
48
#'
49
#' # event type 0-censored, 1-event of interest, 2-competing event
50
#' # t observed time/endpoint
51
#' # z is a binary covariate
52
#' DT <- data.table(z = rbinom(nobs, 1, 0.5))
53
#' DT[, `:=`(
54
#'   "t_event" = rweibull(nobs, 1, b1),
55
#'   "t_comp" = rweibull(nobs, 1, b2)
56
#' )]
57
#' DT[, `:=`(
58
#'   "event" = 1 * (t_event < t_comp) + 2 * (t_event >= t_comp),
59
#'   "time" = pmin(t_event, t_comp)
60
#' )]
61
#' DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]
62
#'
63
#' out <- sampleCaseBase(DT, time = "time", event = "event", comprisk = TRUE)
64
sampleCaseBase <- function(data, time, event, ratio = 10, comprisk = FALSE,
65
                           censored.indicator) {
66
  # Get the variables names for time and event
67 2
  varNames <- checkArgsTimeEvent(data = data, time = time, event = event)
68 2
  timeVar <- varNames$time
69 2
  eventName <- varNames$event
70
  # Check the event categories
71 2
  modifiedEvent <- checkArgsEventIndicator(data, eventName, censored.indicator)
72 2
  eventVar <- modifiedEvent$event.numeric
73
  # Check if we have competing events
74 2
  if (!comprisk && modifiedEvent$nLevels > 2) {
75 2
    stop(paste(
76 2
      "For more than one event type, you should have compRisk=TRUE,",
77 2
      "or reformat your data so that there is only one event of interest."
78
    ),
79 2
    call. = FALSE
80
    )
81
  }
82
  # Create survival object from dataset
83 2
  if (comprisk) surv_type <- "mstate" else surv_type <- "right"
84 2
  survObj <- survival::Surv(data[[timeVar]],
85 2
    eventVar,
86 2
    type = surv_type
87
  )
88

89 2
  n <- nrow(survObj) # no. of subjects
90 2
  B <- sum(survObj[, "time"]) # total person-time in base
91 2
  c <- sum(survObj[, "status"] != 0) # no. of cases (events)
92 2
  b <- ratio * c # size of base series
93 2
  offset <- log(B / b) # offset so intercept = log(ID | x, t = 0 )
94

95
  # We select person-moments from individual proportional
96
  # to their total follow-up time
97 2
  prob_select <- survObj[, "time"] / B
98 2
  which_pm <- sample(n, b, replace = TRUE, prob = prob_select)
99 2
  bSeries <- as.matrix(survObj[which_pm, ])
100 2
  bSeries[, "status"] <- 0
101 2
  bSeries[, "time"] <- runif(b) * bSeries[, "time"]
102

103
  # Combine base series with covariate data
104 2
  selectTimeEvent <- !(colnames(data) %in% c(timeVar, eventName))
105 2
  bSeries <- cbind(bSeries,
106 2
                   subset(data, select = selectTimeEvent)[which_pm, ,
107 2
                                                          drop = FALSE])
108
  # Rename columns appropriately
109 2
  names(bSeries)[names(bSeries) == "status"] <- eventName
110 2
  names(bSeries)[names(bSeries) == "time"] <- timeVar
111

112 2
  cSeries <- data[which(subset(data,
113 2
                               select = (names(data) == eventName)) != 0), ]
114

115
  # Combine case and base series
116 2
  cbSeries <- rbind(cSeries, bSeries)
117
  # Add offset to dataset
118 2
  cbSeries <- cbind(cbSeries, rep_len(offset, nrow(cbSeries)))
119 2
  names(cbSeries)[ncol(cbSeries)] <- "offset"
120

121 2
  class(cbSeries) <- c("cbData", class(cbSeries))
122 2
  return(cbSeries)
123
}

Read our documentation on viewing source code .

Loading