sahirbhatnagar / casebase
1
# This is where all functions related to model fitting should appear
2

3
#' Fit smooth-in-time parametric hazard functions.
4
#'
5
#' Miettinen and Hanley (2009) explained how case-base sampling can be used to
6
#' estimate smooth-in-time parametric hazard functions. The idea is to sample
7
#' person-moments, which may or may not correspond to an event, and then fit the
8
#' hazard using logistic regression.
9
#'
10
#' The object \code{data} should either be the output of the function
11
#' \code{\link{sampleCaseBase}} or the source dataset on which case-base
12
#' sampling will be performed. In the latter case, it is assumed that
13
#' \code{data} contains the two columns corresponding to the supplied time and
14
#' event variables. The variable \code{time} is used for the sampling the base
15
#' series, and therefore it should represent the time variable on its original
16
#' (i.e. non transformed) scale. If \code{time} is missing, the function looks
17
#' for a column named \code{"time"} in the data. Note that the event variable is
18
#' inferred from \code{formula}, since it is the left hand side.
19
#'
20
#' For single-event survival analysis, it is possible to fit the hazard function
21
#' using \code{glmnet}, \code{gam}, or \code{gbm}. The choice of fitting family
22
#' is controlled by the parameter \code{family}. The default value is \code{glm},
23
#' which corresponds to logistic regression. For competing risk analysis, only
24
#' \code{glm} and \code{glmnet} are allowed.
25
#'
26
#' We also provide a matrix interface through \code{fitSmoothHazard.fit}, which
27
#' mimics \code{glm.fit} and \code{gbm.fit}. This is mostly convenient for
28
#' \code{family = "glmnet"}, since a formula interface becomes quickly
29
#' cumbersome as the number of variables increases. In this setting, the matrix
30
#' \code{y} should have two columns and contain the time and event variables
31
#' (e.g. like the output of \code{survival::Surv}). We need this linear function
32
#' of time in order to perform case-base sampling. Therefore, nonlinear
33
#' functions of time should be specified as a one-sided formula through the
34
#' argument \code{formula_time} (the left-hand side is always ignored).
35
#'
36
#' @param formula an object of class "formula" (or one that can be coerced to
37
#'   that class): a symbolic description of the model to be fitted. The details
38
#'   of model specification are given under Details.
39
#' @param data a data frame, list or environment containing the variables in the
40
#'   model. If not found in data, the variables are taken from
41
#'   \code{environment(formula)}, typically the environment from which
42
#'   \code{fitSmoothHazard} is called.
43
#' @param time a character string giving the name of the time variable. See
44
#'   Details.
45
#' @param family a character string specifying the family of regression models
46
#'   used to fit the hazard.
47
#' @param censored.indicator a character string of length 1 indicating which
48
#'   value in \code{event} is the censored. This function will use
49
#'   \code{\link[stats]{relevel}} to set \code{censored.indicator} as the
50
#'   reference level. This argument is ignored if the \code{event} variable is a
51
#'   numeric.
52
#' @param ratio integer, giving the ratio of the size of the base series to that
53
#'   of the case series. Defaults to 100.
54
#' @param ... Additional parameters passed to fitting functions (e.g.
55
#'   \code{glm}, \code{glmnet}, \code{gam}).
56
#' @return An object of \code{glm} and \code{lm} when there is only one event of
57
#'   interest, or of class \code{\link{CompRisk}}, which inherits from
58
#'   \code{vglm}, for a competing risk analysis. As such, functions like
59
#'   \code{summary}, \code{deviance} and \code{coefficients} give familiar
60
#'   results.
61
#' @export
62
#' @examples
63
#' # Simulate censored survival data for two outcome types from exponential
64
#' # distributions
65
#' library(data.table)
66
#' nobs <- 500
67
#' tlim <- 20
68
#'
69
#' # simulation parameters
70
#' b1 <- 200
71
#' b2 <- 50
72
#'
73
#' # event type 0-censored, 1-event of interest, 2-competing event
74
#' # t observed time/endpoint
75
#' # z is a binary covariate
76
#' DT <- data.table(z = rbinom(nobs, 1, 0.5))
77
#' DT[, `:=`(
78
#'   "t_event" = rweibull(nobs, 1, b1),
79
#'   "t_comp" = rweibull(nobs, 1, b2)
80
#' )]
81
#' DT[, `:=`(
82
#'   "event" = 1 * (t_event < t_comp) + 2 * (t_event >= t_comp),
83
#'   "time" = pmin(t_event, t_comp)
84
#' )]
85
#' DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]
86
#'
87
#' out_linear <- fitSmoothHazard(event ~ time + z, DT, ratio = 10)
88
#' out_log <- fitSmoothHazard(event ~ log(time) + z, DT, ratio = 10)
89
#'
90
#' # Use GAMs
91
#' library(mgcv)
92
#' DT[event == 2, event := 1]
93
#' out_gam <- fitSmoothHazard(event ~ s(time) + z, DT,
94
#'                            ratio = 10, family = "gam")
95
#' @importMethodsFrom VGAM summary predict
96
#' @importFrom VGAM vglm multinomial summaryvglm
97
#' @importFrom mgcv s te ti t2
98
fitSmoothHazard <- function(formula, data, time,
99
                            family = c("glm", "gam", "gbm", "glmnet"),
100
                            censored.indicator, ratio = 100, ...) {
101 2
  family <- match.arg(family)
102 2
  if (family == "gbm" && !requireNamespace("gbm", quietly = TRUE)) {
103 0
    stop("Pkg gbm needed for this function to work. Please install it.",
104 0
      call. = FALSE
105
    )
106
  }
107 2
  if (family == "glmnet" && !requireNamespace("glmnet", quietly = TRUE)) {
108 0
    stop("Pkg glmnet needed for this function to work. Please install it.",
109 0
      call. = FALSE
110
    )
111
  }
112 2
  formula <- expand_dot_formula(formula, data = data)
113
  # Infer name of event variable from LHS of formula
114 2
  eventVar <- all.vars(formula[[2]])
115

116 2
  if (missing(time)) {
117 2
    varNames <- checkArgsTimeEvent(data = data, event = eventVar)
118 2
    timeVar <- varNames$time
119
  } else {
120 2
    timeVar <- time
121
  }
122

123 2
  typeEvents <- sort(unique(data[[eventVar]]))
124
  # Call sampleCaseBase if class is not cbData
125 2
  if (!inherits(data, "cbData")) {
126 2
    originalData <- as.data.frame(data)
127 2
    if (missing(censored.indicator)) {
128 2
      sampleData <- sampleCaseBase(originalData, timeVar, eventVar,
129 2
        comprisk = (length(typeEvents) > 2),
130 2
        ratio
131
      )
132
    } else {
133 0
      sampleData <- sampleCaseBase(originalData, timeVar, eventVar,
134 0
        comprisk = (length(typeEvents) > 2),
135 0
        censored.indicator, ratio
136
      )
137
    }
138
  } else {
139
    # If class is cbData we no longer have the original data
140 2
    originalData <- NULL
141 2
    sampleData <- data
142
  }
143

144 2
  if (family != "glmnet") {
145
      # Update formula to add offset term
146
      # glmnet is handled as separate argument
147 2
      formula <- update(formula, ~ . + offset(offset))
148
  }
149

150
    # gbm doesn't play nice with interactions and functions of time
151 2
    if (family == "gbm") {
152
        # So warn the user
153 2
        if (detect_nonlinear_time(formula, timeVar)) {
154 2
          warning(sprintf(paste("You may be using a nonlinear function",
155 2
                                "of %s.\ngbm may throw an error."),
156 2
                          timeVar),
157 2
                  call. = FALSE)
158
        }
159 2
        if (detect_interaction(formula)) {
160 2
            warning("gbm may throw an error when using interaction terms",
161 2
                    call. = FALSE)
162
        }
163
    }
164

165
  # Fit a binomial model if there are no competing risks
166 2
  if (length(typeEvents) == 2) {
167 2
    fittingFunction <- switch(family,
168 2
      "glm" = function(formula) glm(formula, data = sampleData,
169 2
                                    family = binomial),
170 2
      "glmnet" = function(formula) cv.glmnet.formula(formula, sampleData,
171 2
                                                     event = eventVar, ...),
172 2
      "gam" = function(formula) mgcv::gam(formula, sampleData,
173 2
                                          family = "binomial", ...),
174 2
      "gbm" = function(formula) gbm::gbm(formula, sampleData,
175 2
                                         distribution = "bernoulli", ...)
176
    )
177

178 2
    out <- fittingFunction(formula)
179 2
    out$originalData <- originalData
180 2
    out$typeEvents <- typeEvents
181 2
    out$timeVar <- timeVar
182 2
    out$eventVar <- eventVar
183 2
    if (family == "glmnet") out$formula <- formula
184
    # Reset offset for absolute risk estimation, but keep track of it
185 2
    out$offset <- out$data$offset
186 2
    out$data$offset <- 0
187
    # Add new class
188 2
    class(out) <- c("singleEventCB", class(out))
189
  } else {
190
    # Otherwise fit a multinomial regression
191 2
    if (!family %in% c("glm")) {
192 2
      stop(sprintf("Competing-risk analysis is not available for family=%s",
193 2
                   family),
194 2
        .call = FALSE
195
      )
196
    }
197

198 2
    fittingFunction <- function(formula) {
199 2
      VGAM::vglm(formula,
200 2
                 data = sampleData,
201 2
                 family = multinomial(refLevel = 1))
202
      }
203

204
    # Turn off warnings from VGAM::vglm.fitter
205 2
    withCallingHandlers(model <- fittingFunction(formula),
206 2
      warning = handler_fitter
207
    )
208

209
    # The output is an S4 object that extends vglm-class when family='glm'
210
    # Otherwise it's just an S3 object like above
211 2
    out <- switch(family,
212 2
      "glm" = new("CompRisk", model,
213 2
        originalData = originalData,
214 2
        typeEvents = typeEvents,
215 2
        timeVar = timeVar,
216 2
        eventVar = eventVar
217
      ),
218 2
      "glmnet" = structure(
219 2
        c(
220 2
          model,
221 2
          list(
222 2
            "originalData" = originalData,
223 2
            "typeEvents" = typeEvents,
224 2
            "timeVar" = timeVar,
225 2
            "eventVar" = eventVar,
226 2
            "formula" = formula
227
          )
228
        ),
229 2
        class = c("CompRiskGlmnet", class(model))
230
      )
231
    )
232
  }
233 2
  return(out)
234
}
235

236
#' @export
237
#' @rdname fitSmoothHazard
238
#' @param x Matrix containing covariates.
239
#' @param y Matrix containing two columns: one corresponding to time, the other
240
#'   to the event type.
241
#' @param formula_time A formula describing how the hazard depends on time.
242
#'   Defaults to linear.
243
#' @param event a character string giving the name of the event variable.
244
#' @importFrom stats glm.fit
245
fitSmoothHazard.fit <- function(x, y, formula_time, time, event,
246
                                family = c("glm", "gbm", "glmnet"),
247
                                censored.indicator, ratio = 100, ...) {
248 2
  family <- match.arg(family)
249 0
  if (family == "gam") stop("The matrix interface is not available for gam")
250 2
  if (family == "gbm" && !requireNamespace("gbm", quietly = TRUE)) {
251 0
    stop("Pkg gbm needed for this function to work. Please install it.",
252 0
      call. = FALSE
253
    )
254
  }
255 2
  if (family == "glmnet" && !requireNamespace("glmnet", quietly = TRUE)) {
256 0
    stop("Pkg glmnet needed for this function to work. Please install it.",
257 0
      call. = FALSE
258
    )
259
  }
260

261
  # Default to linear term
262 2
  if (missing(formula_time)) {
263 2
    formula_time <- formula(paste("~", time))
264 2
    timeVar <- time
265
  } else {
266 2
    timeVar <- if (length(formula_time) == 3) {
267 0
      all.vars(formula_time[[3]])
268 2
    } else all.vars(formula_time)
269
  }
270
  # There should only be one time variable
271 2
  stopifnot(length(timeVar) == 1)
272

273
  # Try to infer event from data
274 2
  if (missing(event)) {
275 0
      varNames <- checkArgsTimeEvent(data = as.data.frame(y), time = timeVar)
276 0
      eventVar <- varNames$event
277
  } else {
278 2
      eventVar <- event
279
  }
280

281
    # gbm doesn't play nice with interactions and functions of time
282 2
    if (family == "gbm") {
283
        # So warn the user
284 2
        if (detect_nonlinear_time(formula_time, timeVar)) {
285 2
            warning(sprintf(paste("You may be using a nonlinear function",
286 2
                                  "of %s.\ngbm may throw an error."),
287 2
                            timeVar),
288 2
                    call. = FALSE)
289
        }
290 2
        if (detect_interaction(formula_time)) {
291 0
            warning("gbm may throw an error when using interaction terms",
292 0
                    call. = FALSE)
293
        }
294
    }
295

296 2
  typeEvents <- sort(unique(y[, eventVar]))
297
  # Call sampleCaseBase
298 2
  originalData <- list(
299 2
    "x" = x,
300 2
    "y" = y
301
  )
302 2
  class(originalData) <- c(class(originalData), "data.fit")
303 2
  if (missing(censored.indicator)) {
304 2
    sampleData <- sampleCaseBase(as.data.frame(cbind(y, x)),
305 2
      timeVar, eventVar,
306 2
      comprisk = (length(typeEvents) > 2),
307 2
      ratio
308
    )
309
  } else {
310 0
    sampleData <- sampleCaseBase(as.data.frame(cbind(y, x)),
311 0
      timeVar, eventVar,
312 0
      comprisk = (length(typeEvents) > 2),
313 0
      censored.indicator, ratio
314
    )
315
  }
316
  # Format everything into matrices and expand variables that need to be
317 2
  sample_event <- as.matrix(sampleData[, eventVar])
318 2
  sample_time <- if (family %in% c("glmnet", "gbm")) {
319 2
    model.matrix(
320 2
      update(formula_time, ~ . - 1),
321 2
      sampleData
322
    )
323
  } else {
324 2
    model.matrix(formula_time, sampleData)
325
  }
326 2
  sample_time_x <- cbind(
327 2
    sample_time,
328 2
    as.matrix(sampleData[, !names(sampleData) %in% c(eventVar, timeVar,
329 2
                                                     "offset")])
330
  )
331 2
  sample_offset <- sampleData$offset
332

333
  # Fit a binomial model if there are no competing risks
334 2
  if (length(typeEvents) == 2) {
335 2
    out <- switch(family,
336 2
      "glm" = glm.fit(sample_time_x, sample_event,
337 2
        family = binomial(),
338 2
        offset = sample_offset
339
      ),
340 2
      "glmnet" = cv.glmnet_offset_hack(sample_time_x, sample_event,
341 2
        family = "binomial",
342 2
        offset = sample_offset, ...
343
      ),
344 2
      "gbm" = gbm::gbm.fit(sample_time_x, sample_event,
345 2
        distribution = "bernoulli",
346 2
        offset = sample_offset,
347 2
        verbose = FALSE, ...
348
      )
349
    )
350

351 2
    out$originalData <- originalData
352 2
    out$typeEvents <- typeEvents
353 2
    out$timeVar <- timeVar
354 2
    out$eventVar <- eventVar
355 2
    out$matrix.fit <- TRUE
356 2
    out$formula_time <- formula_time
357 2
    out$offset <- sample_offset
358
    # Add new class
359 2
    class(out) <- c("singleEventCB", class(out))
360
  } else {
361 0
    stop("The matrix interface is not available for competing risks")
362
  }
363 2
  return(out)
364
}

Read our documentation on viewing source code .

Loading