#' @title Fit Linear Mixed Model with Lasso or Group Lasso Regularization
#' @description Main function to fit the linear mixed model with lasso or group
#'   lasso penalty for a sequence of tuning parameters. This is a penalized
#'   regression method that accounts for population structure using either the
#'   kinship matrix or the factored realized relationship matrix
#' @param x input matrix, of dimension n x p; where n is the number of
#'   observations and p are the number of predictors.
#' @param y response variable. must be a quantitative variable
#' @param D non-zero eigenvalues. This option is provided to the user should
#'   they decide or need to calculate the eigen decomposition of the kinship
#'   matrix or the singular value decomposition of the matrix of SNPs used to
#'   calculate the kinship outside of this function. This may occur, if for
#'   example, it is easier (e.g. because of memory issues, it's easier to
#'   calculate in plink). This should correspond to the non-zero eigenvalues
#'   only. Note that if you are doing an \code{svd} on the matrix of SNPs used
#'   to calculate the kinship matrix, then you must provide the square of the
#'   singular values so that they correspond to the eigenvalues of the kinship
#'   matrix. If you want to use the low rank estimation algorithm, you must
#'   provide the truncated eigenvalues and eigenvectors to the \code{D} and
#'   \code{U} arguments, respectively. If you want \code{ggmix} to truncate the
#'   eigenvectors and eigenvalues for low rank estimation, then provide either
#'   \code{K} or \code{kinship} instead and specify
#'   \code{n_nonzero_eigenvalues}.
#' @param U left singular vectors corresponding to the non-zero eigenvalues
#'   provided in the \code{D} argument.
#' @param kinship positive definite kinship matrix
#' @param K the matrix of SNPs used to determine the kinship matrix
#' @param n_nonzero_eigenvalues the number of nonzero eigenvalues. This argument
#'   is only used when \code{estimation="low"} and either \code{kinship} or
#'   \code{K} is provided. This argument will limit the function to finding the
#'   \code{n_nonzero_eigenvalues} largest eigenvalues. If \code{U} and \code{D}
#'   have been provided, then \code{n_nonzero_eigenvalues} defaults to the
#'   length of \code{D}.
#' @param n_zero_eigenvalues Currently not being used. Represents the number of
#'   zero eigenvalues. This argument must be specified when \code{U} and
#'   \code{D} are specified and \code{estimation="low"}. This is required for
#'   low rank estimation because the number of zero eigenvalues and their
#'   corresponding eigenvalues appears in the likelihood. In general this would
#'   be the rank of the matrix used to calculate the eigen or singular value
#'   decomposition. When \code{kinship} is provided and \code{estimation="low"}
#'   the default value will be \code{nrow(kinship) - n_nonzero_eigenvalues}.
#'   When \code{K} is provided and \code{estimation="low"}, the default value is
#'   \code{rank(K) - n_nonzero_eigenvalues}
#' @param estimation type of estimation. Currently only \code{type="full"} has
#'   been implemented.
#' @param penalty type of regularization penalty. Currently, only
#'   penalty="lasso" has been implemented.
#' @param group a vector of consecutive integers describing the grouping of the
#'   coefficients. Currently not implemented, but will be used when
#'   penalty="gglasso" is implemented.
#' @param dfmax limit the maximum number of variables in the model. Useful for
#'   very large \code{p} (the total number of predictors in the design matrix),
#'   if a partial path is desired. Default is the number of columns in the
#'   design matrix + 2 (for the variance components)
#' @param penalty.factor Separate penalty factors can be applied to each
#'   coefficient. This is a number that multiplies lambda to allow differential
#'   shrinkage. Can be 0 for some variables, which implies no shrinkage, and
#'   that variable is always included in the model. Default is 1 for all
#'   variables
#' @param lambda A user supplied lambda sequence (this is the tuning parameter).
#'   Typical usage is to have the program compute its own lambda sequence based
#'   on nlambda and lambda.min.ratio. Supplying a value of lambda overrides
#'   this. WARNING: use with care. Do not supply a single value for lambda (for
#'   predictions after CV use predict() instead). Supply instead a decreasing
#'   sequence of lambda values. glmnet relies on its warms starts for speed, and
#'   its often faster to fit a whole path than compute a single fit.
#' @param lambda_min_ratio Smallest value for lambda, as a fraction of
#'   lambda.max, the (data derived) entry value (i.e. the smallest value for
#'   which all coefficients are zero). The default depends on the sample size
#'   nobs relative to the number of variables nvars. If nobs > nvars, the
#'   default is 0.0001, close to zero. If nobs < nvars, the default is 0.01. A
#'   very small value of lambda.min.ratio will lead to a saturated fit in the
#'   nobs < nvars case.
#' @param nlambda the number of lambda values - default is 100.
#' @param eta_init initial value for the eta parameter, with \eqn{0 < \eta < 1}
#'   used in determining lambda.max and starting value for fitting algorithm.
#' @param maxit Maximum number of passes over the data for all lambda values;
#'   default is 10^2.
#' @param fdev Fractional deviance change threshold. If change in deviance
#'   between adjacent lambdas is less than fdev, the solution path stops early.
#'   Default is 1e-20.
#' @param alpha The elasticnet mixing parameter, with \eqn{0 \leq \alpha \leq
#'   1}. alpha=1 is the lasso penalty, and alpha=0 the ridge penalty.
#' @param thresh_glmnet Convergence threshold for coordinate descent for
#'   updating beta parameters. Each inner coordinate-descent loop continues
#'   until the maximum change in the objective after any coefficient update is
#'   less than thresh times the null deviance. Default value is 1e-8.
#' @param standardize Logical flag for x variable standardization, prior to
#'   fitting the model sequence. The coefficients are always returned on the
#'   original scale. Default is standardize=FALSE. If variables are in the same
#'   units already, you might not wish to standardize.
#' @param epsilon Convergence threshold for block relaxation of the entire
#'   parameter vector \eqn{\Theta = ( \beta, \eta, \sigma^2 )}. The algorithm
#'   converges when \deqn{crossprod(\Theta_{j+1} - \Theta_{j}) < \epsilon}.
#'   Default value is 1E-4
#' @param verbose display progress. Can be either 0,1 or 2. 0 will not display
#'   any progress, 2 will display very detailed progress and 1 is somewhere in
#'   between. Default: 0.
#'
#' @examples
#' data(admixed)
#' fitlmm <- ggmix(x = admixed$xtrain, y = admixed$ytrain,
#'                 kinship = admixed$kin_train,
#'                 estimation = "full")
#' gicfit <- gic(fitlmm)
#' coef(gicfit, type = "nonzero")
#' predict(gicfit, newx = admixed$xtest)[1:5,,drop=FALSE]
#' plot(gicfit)
#' plot(fitlmm)
#'
#' @references Bhatnagar, Sahir R, Yang, Yi, Lu, Tianyuan, Schurr, Erwin,
#'   Loredo-Osti, JC, Forest, Marie, Oualkacha, Karim, and Greenwood, Celia MT.
#'   (2020) \emph{Simultaneous SNP selection and adjustment for population
#'   structure in high dimensional prediction models}
#'   \url{https://doi.org/10.1101/408484}
#'
#'   Friedman, J., Hastie, T. and Tibshirani, R. (2008) \emph{Regularization
#'   Paths for Generalized Linear Models via Coordinate Descent},
#'   \url{http://www.stanford.edu/~hastie/Papers/glmnet.pdf} \emph{Journal of
#'   Statistical Software, Vol. 33(1), 1-22 Feb 2010}
#'   \url{http://www.jstatsoft.org/v33/i01/}
#'
#'   Yang, Y., & Zou, H. (2015). A fast unified algorithm for solving
#'   group-lasso penalize learning problems. \emph{Statistics and Computing},
#'   25(6), 1129-1141.
#'   \url{http://www.math.mcgill.ca/yyang/resources/papers/gglasso.pdf}
#' @export
128
ggmix <- function(x, y,
                  U, D,
                  kinship,
                  K,
                  n_nonzero_eigenvalues,
                  n_zero_eigenvalues,
                  estimation = c("full"),
                  penalty = c("lasso"),
                  group,
                  penalty.factor = rep(1, p_design),
                  lambda = NULL,
                  lambda_min_ratio = ifelse(n_design < p_design, 0.01, 0.0001),
                  nlambda = 100,
                  eta_init = 0.5,
                  maxit = 100,
                  fdev = 1e-20,
                  standardize = FALSE,
                  alpha = 1, # elastic net mixing param. 1 is lasso, 0 is ridge
                  thresh_glmnet = 1e-8, # this is for glmnet
                  epsilon = 1e-4, # this is for ggmix
                  dfmax = p_design + 2,
                  verbose = 0) {
  this.call <- match.call()

  # Check input -------------------------------------------------------------

  # match.arg() errors with an uninformative message on bad input; wrap it so
  # the user sees which values are currently supported.
  estimation <- tryCatch(match.arg(estimation),
    error = function(c) {
      stop(strwrap("Estimation method should be
                   \"full\""),
        call. = FALSE
      )
    }
  )

  penalty <- tryCatch(match.arg(penalty),
    error = function(c) {
      stop(strwrap("Penalty type should be \"lasso\""),
        call. = FALSE
      )
    }
  )

  if (!is.matrix(x)) {
    stop("x has to be a matrix")
  }
  if (any(is.na(x))) {
    stop("Missing values in x not allowed.")
  }
  if (any(is.na(y))) {
    stop("Missing values in y not allowed.")
  }
  y <- drop(y)
  if (!is.numeric(y)) {
    stop("y must be numeric. Factors are not allowed.")
  }

  # U and D must be supplied together (eigenvectors and their eigenvalues).
  # Use short-circuiting scalar operators in all scalar if() conditions.
  if ((!missing(U) && missing(D)) || (!missing(D) && missing(U))) {
    stop("both U and D must be specified.")
  }

  if (penalty == "gglasso" && missing(group)) {
    stop(strwrap("group cannot be missing when using the group lasso
                 penalty"))
  }

  np_design <- dim(x)
  # NOTE: `||` (not `|`) is required here. With `|`, a NULL dim() would make
  # np_design[2] NULL and the condition logical(0), causing an
  # "argument is of length zero" error instead of this informative message.
  if (is.null(np_design) || (np_design[2] <= 1)) {
    stop("x should be a matrix with 2 or more columns")
  }

  # note that p_design doesn't contain the intercept
  # whereas the x in the ggmix_object will have the intercept
  n_design <- np_design[[1]]
  p_design <- np_design[[2]]

  dfmax <- as.double(dfmax)

  # Exactly one of kinship, (U, D), or K should be used; resolve conflicts
  # below with a preference order of kinship > (U, D) > K.
  is_kinship <- !missing(kinship)
  is_UD <- !missing(U) && !missing(D)
  is_K <- !missing(K)

  if (all(is_kinship, is_UD, is_K)) {
    warning(strwrap("kinship, U, D and K arguments have all been specified.
                    Only one of either kinship, U and D, or K need to be
                    specified. Ignoring U, D and K. Only using kinship in the
                    analysis"))
    is_UD <- FALSE
    is_K <- FALSE
  }

  if (all(is_kinship, is_UD, !is_K) || all(is_kinship, !is_UD, is_K)) {
    warning(strwrap("more than one of kinship, U, D or K arguments have
                    been specified. Only one of either kinship, U and D,
                    or K need to be specified. Only using kinship in the
                    analysis."))
    is_UD <- FALSE
    is_K <- FALSE
  }

  if (!is_kinship) {
    if (all(is_UD, is_K)) {
      warning(strwrap("U, D and K arguments have been specified. Only one of
                    either U and D, or K need to be specified. Only using
                    U and D in the analysis and ignoring K."))
      is_K <- FALSE
    }
  }

  if (!missing(n_nonzero_eigenvalues)) {
    n_nonzero_eigenvalues <- as.double(n_nonzero_eigenvalues)
  }

  if (!missing(n_zero_eigenvalues)) {
    n_zero_eigenvalues <- as.double(n_zero_eigenvalues)
  }


  # check kinship input -----------------------------------------------------

  if (is_kinship) {
    if (!is.matrix(kinship)) {
      stop("kinship has to be a matrix")
    }
    np_kinship <- dim(kinship)
    n_kinship <- np_kinship[[1]]
    p_kinship <- np_kinship[[2]]

    if (n_kinship != p_kinship) {
      stop("kinship should be a square matrix")
    }
    if (n_kinship != n_design) {
      stop(strwrap("number of rows in kinship matrix must equal the
                   number of rows in x matrix"))
    }

    if (estimation == "low") {
      if (missing(n_nonzero_eigenvalues)) {
        stop(strwrap("when kinship is specified and estimation=\"low\",
                     n_nonzero_eigenvalues must be specified."))
      }
    }

    # `&&` so n_zero_eigenvalues is only defaulted in the low-rank case.
    if (missing(n_zero_eigenvalues) && estimation == "low") {
      n_zero_eigenvalues <- nrow(kinship) - n_nonzero_eigenvalues
      warning(sprintf(
        "n_zero_eigenvalues missing. setting to %g",
        n_zero_eigenvalues
      ))
    }

    corr_type <- "kinship"
  }


  # check K matrix input ----------------------------------------------------

  if (is_K) {
    if (!is.matrix(K)) {
      stop("K has to be a matrix")
    }
    np_K <- dim(K)
    n_K <- np_K[[1]]
    p_K <- np_K[[2]]

    if (n_K != n_design) {
      stop(strwrap("number of rows in K matrix must equal the
                   number of rows in x matrix"))
    }

    if (estimation == "low") {
      if (missing(n_nonzero_eigenvalues)) {
        stop(strwrap("when K is specified and estimation=\"low\",
                     n_nonzero_eigenvalues must be specified."))
      }
    }

    if (missing(n_zero_eigenvalues) && estimation == "low") {
      # min(n, p) is an upper bound on rank(K); used as the default total
      # number of eigenvalues from which the nonzero ones are subtracted.
      n_zero_eigenvalues <- min(n_K, p_K) - n_nonzero_eigenvalues
      warning(sprintf(
        "n_zero_eigenvalues missing. setting to %g",
        n_zero_eigenvalues
      ))
    }

    corr_type <- "K"
  }


  # check U and D input -----------------------------------------------------

  if (is_UD) {
    if (!is.matrix(U)) {
      stop("U has to be a matrix")
    }
    if (missing(n_zero_eigenvalues) && estimation == "low") {
      stop(strwrap("n_zero_eigenvalues must be specified when U and D have
                   been provided and estimation=\"low\". In general this would
                   be the rank of the matrix used to calculate the eigen or
                   singular value decomposition."))
    }
    np_U <- dim(U)
    n_U <- np_U[[1]]
    p_U <- np_U[[2]]

    D <- drop(D)
    if (!is.numeric(D)) stop(strwrap("D must be numeric"))
    p_D <- length(D)

    if (n_U != n_design) {
      stop(strwrap("number of rows in U matrix must equal the
                   number of rows in x matrix"))
    }
    if (p_U != p_D) {
      stop(strwrap("Length of D should equal the number of columns of U."))
    }

    if (missing(n_nonzero_eigenvalues)) {
      n_nonzero_eigenvalues <- p_D
    }

    corr_type <- "UD"
  }

  # Ensure x has column names so coefficients can be labelled downstream.
  vnames <- colnames(x)
  if (is.null(vnames)) {
    vnames <- paste("V", seq(p_design), sep = "")
    colnames(x) <- vnames
  }

  if (!missing(penalty.factor)) {
    penalty.factor <- as.double(penalty.factor)
    if (length(penalty.factor) != p_design) {
      stop(strwrap("length of penalty.factor should be equal to number
                   of columns in x."))
    }
  }


  # Create ggmix objects ----------------------------------------------------

  if (estimation == "full") {
    ggmix_data_object <- switch(corr_type,
      kinship = new_fullrank_kinship(x = x, y = y, kinship = kinship),
      K = new_fullrank_K(x = x, y = y, K = K),
      UD = new_fullrank_UD(x = x, y = y, U = U, D = D)
    )
  }

  if (estimation == "low") {
    # NOTE: `||` (not `|`) is required here. With `|`, both operands are
    # evaluated, so a missing n_nonzero_eigenvalues would raise
    # "argument is missing" instead of this informative message.
    if (missing(n_nonzero_eigenvalues) || n_nonzero_eigenvalues <= 0) {
      stop(strwrap("n_nonzero_eigenvalues can't be missing, equal to 0 or negative
                 when low rank estimation is specified (estimation=\"low\").
                 Use estimation=\"full\" if you want all eigenvalues to be
                 computed."))
    }

    # RSpectra is an optional dependency, only needed for truncated
    # eigendecomposition in the low-rank algorithm.
    if (!requireNamespace("RSpectra", quietly = TRUE)) {
      stop(strwrap("Package \"RSpectra\" needed when low rank estimation is
                   specified (estimation=\"low\"). Please install it."),
        call. = FALSE
      )
    }

    ggmix_data_object <- switch(corr_type,
      kinship = new_lowrank_kinship(
        x = x, y = y,
        kinship = kinship,
        n_nonzero_eigenvalues = n_nonzero_eigenvalues,
        n_zero_eigenvalues = n_zero_eigenvalues
      ),
      K = new_lowrank_K(
        x = x, y = y, K = K,
        n_nonzero_eigenvalues = n_nonzero_eigenvalues,
        n_zero_eigenvalues = n_zero_eigenvalues
      ),
      UD = new_lowrank_UD(
        x = x, y = y, U = U, D = D,
        n_nonzero_eigenvalues = n_nonzero_eigenvalues,
        n_zero_eigenvalues = n_zero_eigenvalues
      )
    )
  }


  # fit linear mixed model --------------------------------------------------

  if (penalty == "lasso") {
    fit <- lmmlasso(ggmix_data_object,
      penalty.factor = penalty.factor,
      lambda = lambda,
      lambda_min_ratio = lambda_min_ratio,
      nlambda = nlambda,
      n_design = n_design,
      p_design = p_design,
      eta_init = eta_init,
      maxit = maxit,
      fdev = fdev,
      standardize = standardize,
      alpha = alpha, # elastic net mixing param. 1 is lasso, 0 is ridge
      thresh_glmnet = thresh_glmnet, # this is for glmnet
      epsilon = epsilon,
      dfmax = dfmax,
      verbose = verbose
    )
  } else if (penalty == "gglasso") {
    # not yet implemented
  }

  fit$call <- this.call

  return(fit)
}

Read our documentation on viewing source code .

Loading