1
#' Simulation Scenario from Bhatnagar et al. (2018+) ggmix paper
2
#'
3
#' @description Function that generates data of the different simulation studies
4
#'   presented in the accompanying paper. This function requires the
5
#'   \code{popkin} and \code{bnpsd} package to be installed.
6
#' @param n number of observations to simulate
7
#' @param p_design number of variables in X_test, i.e., the design matrix
8
#' @param p_kinship number of variable in X_kinship, i.e., matrix used to
9
#'   calculate kinship
10
#' @param k number of intermediate subpopulations.
11
#' @param s the desired bias coefficient, which specifies sigma indirectly.
12
#'   Required if sigma is missing
13
#' @param Fst The desired final FST of the admixed individuals. Required if
14
#'   sigma is missing
15
#' @param b0 the true intercept parameter
16
#' @param eta the true eta parameter, which has to be \code{0 < eta < 1}
17
#' @param sigma2 the true sigma2 parameter
18
#' @param nPC number of principal components to include in the design matrix
19
#'   used for regression adjustment for population structure via principal
20
#'   components. This matrix is used as the input in a standard lasso regression
21
#'   routine, where there are no random effects.
22
#' @param geography the type of geography for simulation the kinship matrix.
23
#'   "ind" is independent populations where every individuals is actually
24
#'   unadmixed, "1d" is a 1D geography and "circ" is circular geography.
25
#'   Default: "ind". See the functions in the \code{bnpsd} for details on how
26
#'   this data is actually generated.
27
#' @param percent_causal percentage of \code{p_design} that is causal. must be
28
#'   \eqn{0 \leq percent_causal \leq 1}. The true regression coefficients are
29
#'   generated from a standard normal distribution.
30
#' @param percent_overlap this represents the percentage of causal SNPs that
31
#'   will also be included in the calculation of the kinship matrix
32
#' @param train_tune_test the proportion of sample size used for training tuning
33
#'   parameter selection and testing. default is 60/20/20 split
34
#' @details The kinship is estimated using the \code{popkin} function from the
35
#'   \code{popkin} package. This function will multiple that kinship matrix by 2
36
#'   to give the expected covariance matrix which is subsequently used in the
37
#'   linear mixed models
38
#' @return A list with the following elements \describe{\item{ytrain}{simulated
39
#'   response vector for training set} \item{ytune}{simulated response vector
40
#'   for tuning parameter selection set} \item{ytest}{simulated response vector
41
#'   for test set} \item{xtrain}{simulated design matrix for training
42
#'   set}\item{xtune}{simulated design matrix for tuning parameter selection
43
#'   set}\item{xtest}{simulated design matrix for testing set}
44
#'   \item{xtrain_lasso}{simulated design matrix for training set for lasso
45
#'   model. This is the same as xtrain, but also includes the nPC principal
46
#'   components} \item{xtune_lasso}{simulated design matrix for tuning parameter
47
#'   selection set for lasso model. This is the same as xtune, but also includes
48
#'   the nPC principal components}\item{xtest}{simulated design matrix for
49
#'   testing set for lasso model. This is the same as xtest, but also includes
50
#'   the nPC principal components} \item{causal}{character vector of the names
51
#'   of the causal SNPs} \item{beta}{the vector of true regression coefficients}
52
#'   \item{kin_train}{2 times the estimated kinship for the training set
53
#'   individuals} \item{kin_tune_train}{The covariance matrix between the tuning
54
#'   set and the training set individuals} \item{kin_test_train}{The covariance
55
#'   matrix between the test set and training set individuals}
56
#'   \item{Xkinship}{the matrix of SNPs used to estimate the kinship matrix}
57
#'   \item{not_causal}{character vector of the non-causal SNPs} \item{PC}{the
58
#'   principal components for population structure adjustment} }
59
#' @seealso \code{\link[bnpsd]{admix_prop_1d_linear}}
60
#' @export
61
#' @examples
62
#' admixed <- gen_structured_model(n = 100,
63
#'                                 p_design = 50,
64
#'                                 p_kinship = 5e2,
65
#'                                 geography = "1d",
66
#'                                 percent_causal = 0.10,
67
#'                                 percent_overlap = "100",
68
#'                                 k = 5, s = 0.5, Fst = 0.1,
69
#'                                 b0 = 0, nPC = 10,
70
#'                                 eta = 0.1, sigma2 = 1,
71
#'                                 train_tune_test = c(0.8, 0.1, 0.1))
72
#' names(admixed)
73
gen_structured_model <- function(n, p_design, p_kinship, k, s, Fst, b0, nPC = 10,
74
                                 eta, sigma2, geography = c("ind", "1d", "circ"),
75
                                 percent_causal, percent_overlap, train_tune_test = c(0.6, 0.2, 0.2)) {
76

77
  # p_design:
78
  # p_kinship:
79
  # k:	N
80
  # s:
81
  # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations,
82
  # up to a scaling factor (which cancels out in calculations). Required if sigma is missing
83
  # Fst: T
84
  # browser()
85
  # define population structure
86

87 0
  if(sum(train_tune_test) != 1) stop("Training/tune/test split must be equal to 1")
88

89 4
  if (!requireNamespace("bnpsd", quietly = TRUE)) {
90 0
    stop(strwrap("Package \"bnpsd\" needed to simulate data.
91 0
                 Please install it."),
92 0
      call. = FALSE
93
    )
94
  }
95

96
  # if (!requireNamespace("simtrait", quietly = TRUE)) {
97
  #   stop(strwrap("Package \"simtrait\" needed to simulate data.
98
  #                Please install it."),
99
  #        call. = FALSE
100
  #   )
101
  # }
102

103 4
  if (!requireNamespace("popkin", quietly = TRUE)) {
104 0
    stop(strwrap("Package \"popkin\" needed to simulate data.
105 0
                 Please install it."),
106 0
      call. = FALSE
107
    )
108
  }
109

110

111
  ########################
112
  ### Generate Kinship ###
113
  ########################
114

115
  # FF <- 1:k # subpopulation FST vector, up to a scalar
116
  # s <- 0.5 # desired bias coefficient
117
  # Fst <- 0.1 # desired FST for the admixed individuals
118 4
  geography <- match.arg(geography)
119 4
  if (geography == "1d") {
120 4
    FF <- 1:k # subpopulation FST vector, up to a scalar
121 4
    obj <- bnpsd::admix_prop_1d_linear(n_ind = n,
122 4
                                       k_subpops = k,
123 4
                                       bias_coeff = s,
124 4
                                       coanc_subpops = FF,
125 4
                                       fst = Fst)
126
    # Q <- obj$Q
127
    # FF <- obj$F
128 4
    admix_proportions <- obj$admix_proportions
129
    # rescaled inbreeding vector for intermediate subpopulations
130 4
    inbr_subpops <- obj$coanc_subpops
131

132
    # get pop structure parameters of the admixed individuals
133 4
    coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
134 4
    kinship <- bnpsd::coanc_to_kinship(coancestry)
135

136 4
  } else if (geography == "ind") {
137 4
    ngroup <- n / k # equal sized groups
138
    # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3)
139 4
    labs <- rep(paste0("S", 1:k), each = ngroup)
140
    # data dimensions infered from labs:
141 4
    length(labs) # number of individuals "n"
142
    # desired admixture matrix ("is" stands for "Independent Subpopulations")
143
    # number of subpopulations "k_subpops"
144 4
    k_subpops <- length(unique(labs))
145

146
    # desired admixture matrix
147 4
    admix_proportions <- bnpsd::admix_prop_indep_subpops(labs)
148

149
    # subpopulation FST vector, unnormalized so far
150 4
    inbr_subpops <- 1 : k_subpops
151
    # normalized to have the desired FST
152
    # NOTE fst is a function in the `popkin` package
153 4
    inbr_subpops <- inbr_subpops / popkin::fst(inbr_subpops) * Fst
154
    # verify FST for the intermediate subpopulations
155
    # fst(inbr_subpops)
156
    #> [1] 0.2
157

158
    # get coancestry of the admixed individuals
159 4
    coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
160
    # before getting FST for individuals, weigh then inversely proportional to subpop sizes
161 4
    weights <- popkin::weights_subpops(labs) # function from `popkin` package
162

163 4
    kinship <- bnpsd::coanc_to_kinship(coancestry)
164

165 4
  } else if (geography == "circ") {
166 4
    FF <- 1:k # subpopulation FST vector, up to a scalar
167
    # obj <- bnpsd::admix_prop_1d_circular(n_ind = n, k_subpops = k, s = s, F = FF, Fst = Fst)
168
    # Q <- obj$Q
169
    # FF <- obj$F
170

171
    # admixture proportions from *circular* 1D geography
172 4
    obj <- bnpsd::admix_prop_1d_circular(
173 4
      n_ind = n,
174 4
      k_subpops = k,
175 4
      bias_coeff = s,
176 4
      coanc_subpops = FF,
177 4
      fst = Fst
178
    )
179 4
    admix_proportions <- obj$admix_proportions
180 4
    inbr_subpops <- obj$coanc_subpops
181

182
    # get pop structure parameters of the admixed individuals
183 4
    coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
184

185 4
    kinship <- bnpsd::coanc_to_kinship(coancestry)
186
  }
187

188

189
  #####################
190
  ### GENERATE SNPS ###
191
  #####################
192

193 4
  ncausal <- p_design * percent_causal
194
  # browser()
195 4
  if (percent_overlap == "100") {
196

197 4
    total_snps_to_simulate <- p_design + p_kinship - ncausal
198

199
    # this contains all SNPs (X_{Design}:X_{kinship})
200
    # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate)
201

202
    # draw all random Allele Freqs (AFs) and genotypes
203
    # reuse the previous inbr_subpops, admix_proportions
204 4
    out <- bnpsd::draw_all_admix(
205 4
      admix_proportions = admix_proportions,
206 4
      inbr_subpops = inbr_subpops,
207 4
      m_loci = total_snps_to_simulate,
208
      # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them
209 4
      want_p_subpops = TRUE,
210 4
      want_p_ind = TRUE
211
    )
212

213 4
    Xall <- t(out$X) # genotypes are columns, rows are subjects
214 4
    cnames <- paste0("X", 1:total_snps_to_simulate)
215 4
    colnames(Xall) <- cnames
216 4
    rownames(Xall) <- paste0("id", 1:n)
217 4
    Xall[1:5,1:5]
218 4
    dim(Xall)
219 4
    subpops <- ceiling( (1:n)/n*k )
220 4
    table(subpops) # got k=10 subpops with 100 individuals each
221

222
    ###################
223
    ### CAUSAL LOCI ###
224
    ###################
225
    # browser()
226
    # Snps used for kinship
227 4
    snps_kinships <- sample(cnames, p_kinship, replace = FALSE)
228

229
    # all causal snps are in kinship matrix
230 4
    if (percent_causal != 0 ) {
231
      # browser()
232
      # compute marginal allele frequencies
233 4
      p_anc_hat <- colMeans(Xall[,snps_kinships,drop=FALSE], na.rm = TRUE)/2
234

235
      # select random SNPs! this performs the magic...
236
      # also runs additional checks
237 4
      causal_indexes <- select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05)
238

239
      # draw random SNP coefficients for selected loci
240
      # causal_coeffs <- stats::rnorm(ncausal, 0, 1)
241

242 4
      causal <- snps_kinships[causal_indexes]
243 4
      snps_design <- c(setdiff(cnames, snps_kinships), causal)
244 4
      not_causal <- setdiff(snps_design, causal)
245

246
      # subset data to consider causal loci only
247 4
      p_anc_hat <- p_anc_hat[causal_indexes]
248

249 0
    } else if (percent_causal == 0) {
250 0
      causal <- ""
251 0
      snps_design <- setdiff(cnames, snps_kinships)
252 0
      not_causal <- snps_design
253
    }
254

255 4
    Xkinship <- Xall[,snps_kinships]
256 4
    Xdesign <- Xall[,snps_design]
257

258
    # Xdesign_causal <- Xall[,causal_indexes,drop=F] # the subset of causal data (keep as a matrix even if m_causal == 1)
259

260
    # now estimate kinship using popkin
261 4
    PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE)
262 4
    mean_kinship <- mean(PhiHat)
263
    # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE)
264

265 0
  } else if (percent_overlap == "0") {
266

267 0
    total_snps_to_simulate <- p_design + p_kinship
268

269
    # this contains all SNPs (X_{Testing}:X_{kinship})
270
    # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate)
271

272
    # draw all random Allele Freqs (AFs) and genotypes
273
    # reuse the previous inbr_subpops, admix_proportions
274 0
    out <- bnpsd::draw_all_admix(
275 0
      admix_proportions = admix_proportions,
276 0
      inbr_subpops = inbr_subpops,
277 0
      m_loci = total_snps_to_simulate,
278
      # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them
279 0
      want_p_subpops = TRUE,
280 0
      want_p_ind = TRUE
281
    )
282

283

284 0
    Xall <- t(out$X) # genotypes are columns, rows are subjects
285 0
    cnames <- paste0("X", 1:total_snps_to_simulate)
286 0
    colnames(Xall) <- cnames
287 0
    rownames(Xall) <- paste0("id", 1:n)
288 0
    Xall[1:5,1:5]
289 0
    dim(Xall)
290 0
    subpops <- ceiling( (1:n)/n*k )
291 0
    table(subpops) # got k=10 subpops with 100 individuals each
292

293

294
    # Snps used for kinship
295 0
    snps_kinships <- sample(cnames, p_kinship, replace = FALSE)
296 0
    length(snps_kinships)
297

298 0
    snps_design <- setdiff(cnames, snps_kinships)
299
    # length(snps_design)
300
    # setdiff(cnames, snps_kinships) %>% length()
301 0
    if (percent_causal !=0) {
302

303
      # compute marginal allele frequencies
304 0
      p_anc_hat <- colMeans(Xall[,snps_design, drop=FALSE], na.rm = TRUE)/2
305

306
      # select random SNPs! this performs the magic...
307
      # also runs additional checks
308 0
      causal_indexes <- select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05)
309

310
      # draw random SNP coefficients for selected loci
311
      # causal_coeffs <- stats::rnorm(ncausal, 0, 1)
312

313 0
      causal <- snps_design[causal_indexes]
314
      # causal <- sample(snps_design, ncausal, replace = FALSE)
315 0
    } else if (percent_causal == 0) {
316 0
      causal <- ""
317
    }
318

319 0
    not_causal <- setdiff(snps_design, causal)
320

321 0
    Xkinship <- Xall[,snps_kinships]
322 0
    Xdesign <- Xall[,snps_design]
323

324
    # now estimate kinship using popkin
325 0
    PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE)
326
    # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE)
327

328
  }
329

330 4
  np <- dim(Xdesign)
331 4
  n <- np[[1]]
332 4
  p <- np[[2]]
333

334 4
  beta <- rep(0, length = p)
335 4
  if (percent_causal != 0) {
336 4
    beta[which(colnames(Xdesign) %in% causal)] <- stats::rnorm(n = length(causal))
337
  }
338

339 4
  mu <- as.numeric(Xdesign %*% beta)
340

341
  # y <- trait
342 4
  kin <- 2 * PhiHat
343

344 4
  tt <- eta * sigma2 * kin
345 4
  if (!all(eigen(tt)$values > 0)) {
346 4
    message("eta * sigma2 * kin not PD, using Matrix::nearPD")
347 4
    tt <- Matrix::nearPD(tt)$mat
348
  }
349

350 4
  P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt)
351 4
  E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n))
352
  # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim)
353
  # y <- b0 + mu + t(P) + t(E)
354
  # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n))
355 4
  y <- b0 + mu + P + E
356

357
  # partition the data into train/tune/test
358 4
  spec = c(train = train_tune_test[1], tune = train_tune_test[2], test = train_tune_test[3])
359

360 4
  g = sample(cut(
361 4
    seq(nrow(Xdesign)),
362 4
    nrow(Xdesign)*cumsum(c(0,spec)),
363 4
    labels = names(spec)
364
  ))
365
  # g %>% table
366

367 4
  train_ind <- which(g == "train")
368 4
  tune_ind <- which(g == "tune")
369 4
  test_ind <- which(g == "test")
370
  # res = split(admixed$x, g)
371

372 4
  xtrain <- Xdesign[train_ind,,drop=FALSE]
373 4
  xtune <- Xdesign[tune_ind,,drop=FALSE]
374 4
  xtest <- Xdesign[test_ind,,drop=FALSE]
375

376 4
  ytrain <- y[train_ind]
377 4
  ytune <- y[tune_ind]
378 4
  ytest <- y[test_ind]
379

380 4
  kin_train <- kin[train_ind,train_ind]
381 4
  kin_tune_train <- kin[tune_ind,train_ind]
382 4
  kin_test_train <- kin[test_ind,train_ind]
383

384
  # xtrain <- Xdesign[ind,,drop=FALSE]
385
  # xtune <- Xdesign[-ind,,drop=FALSE]
386 4
  PC_all <- stats::prcomp(Xdesign)
387 4
  PC <- stats::prcomp(xtrain)
388 4
  xtrain_lasso <- cbind(xtrain, PC$x[,1:nPC])
389 4
  xtune_pc <- stats::predict(PC, newdata = xtune)
390 4
  xtune_lasso <- cbind(xtune, xtune_pc[,1:nPC])
391 4
  xtest_pc <- stats::predict(PC, newdata = xtest)
392 4
  xtest_lasso <- cbind(xtest, xtest_pc[,1:nPC])
393

394 4
  mu_train <- mu[train_ind]
395

396

397 4
  return(list(ytrain = ytrain,
398 4
              ytune = ytune,
399 4
              ytest = ytest,
400

401 4
              xtrain = xtrain,
402 4
              xtune = xtune,
403 4
              xtest = xtest,
404

405 4
              xtrain_lasso = xtrain_lasso,
406 4
              xtune_lasso = xtune_lasso,
407 4
              xtest_lasso = xtest_lasso,
408

409 4
              Xkinship = Xall[train_ind,snps_kinships, drop = F],
410 4
              kin_train = kin_train,
411 4
              kin_tune_train = kin_tune_train,
412 4
              kin_test_train = kin_test_train, # covaraince between train and test
413

414 4
              mu_train = mu_train, # Xbeta for training set
415

416 4
              causal = causal,
417 4
              beta = beta,
418

419 4
              not_causal = not_causal,
420

421
              # used in manuscript to generate simulation study figures
422 4
              kinship = kinship,
423 4
              coancestry = coancestry,
424 4
              PC = PC_all$x[,1:nPC],
425 4
              subpops = subpops
426
  ))
427
}
428

429

430 0
l2norm <- function(x) sqrt(sum(x^2))
431

432

433
"%ni%" <- Negate("%in%")
434

435
# internal function
436
# taken verbatim from glmnet package.
437
# it used to be exported by glmnet, but is no longer exported.
438
lambda.interp <- function (lambda, s) {
439 4
  if (length(lambda) == 1) {
440 0
    nums = length(s)
441 0
    left = rep(1, nums)
442 0
    right = left
443 0
    sfrac = rep(1, nums)
444
  }
445
  else {
446 4
    s[s > max(lambda)] = max(lambda)
447 4
    s[s < min(lambda)] = min(lambda)
448 4
    k = length(lambda)
449 4
    sfrac <- (lambda[1] - s)/(lambda[1] - lambda[k])
450 4
    lambda <- (lambda[1] - lambda)/(lambda[1] - lambda[k])
451 4
    coord <- stats::approx(lambda, seq(lambda), sfrac)$y
452 4
    left <- floor(coord)
453 4
    right <- ceiling(coord)
454 4
    sfrac = (sfrac - lambda[right])/(lambda[left] - lambda[right])
455 4
    sfrac[left == right] = 1
456 4
    sfrac[abs(lambda[left] - lambda[right]) < .Machine$double.eps] = 1
457
  }
458 4
  list(left = left, right = right, frac = sfrac)
459
}
460

461

462

463
# internal function
464
# taken verbatim from glmnet package.
465
# it used to be exported by glmnet, but is no longer exported.
466
nonzeroCoef <- function (beta, bystep = FALSE) {
467 4
  nr = nrow(beta)
468 4
  if (nr == 1) {
469 0
    if (bystep)
470 0
      apply(beta, 2, function(x) if (abs(x) > 0)
471 0
        1
472 0
        else NULL)
473
    else {
474 0
      if (any(abs(beta) > 0))
475 0
        1
476 0
      else NULL
477
    }
478
  }
479
  else {
480 4
    beta = abs(beta) > 0
481 4
    which = seq(nr)
482 4
    ones = rep(1, ncol(beta))
483 4
    nz = as.vector((beta %*% ones) > 0)
484 4
    which = which[nz]
485 4
    if (bystep) {
486 4
      if (length(which) > 0) {
487 4
        beta = as.matrix(beta[which, , drop = FALSE])
488 4
        nzel = function(x, which) if (any(x))
489 4
          which[x]
490 4
        else NULL
491 4
        which = apply(beta, 2, nzel, which)
492 4
        if (!is.list(which))
493 4
          which = data.frame(which)
494 4
        which
495
      }
496
      else {
497 0
        dn = dimnames(beta)[[2]]
498 0
        which = vector("list", length(dn))
499 0
        names(which) = dn
500 0
        which
501
      }
502
    }
503 4
    else which
504
  }
505
}
506

507

508

509
# internal function
510
# taken verbatim from https://github.com/OchoaLab/simtrait/blob/master/R/select_loci.R
511
select_loci <- function(maf, m_causal, maf_cut = 0.05) {
512
  # check for missing parameters
513 4
  if (missing(maf))
514 0
    stop('marginal allele frequency vector `maf` is required!')
515 4
  if (missing(m_causal))
516 0
    stop('the number of causal loci `m_causal` is required!')
517

518
  # data dimensions
519 4
  m <- length(maf)
520
  # other checks
521 4
  if (m_causal > m)
522 0
    stop('the number of causal loci cannot be larger than the total number of loci (', m_causal, ' > ', m, ')')
523

524
  # select random loci!
525
  # we might not want to pick extremely rare alleles, so set MAF thresholds
526 4
  i <- which(maf_cut <= maf & maf <= 1 - maf_cut) # candidate locus indexes
527 4
  sample(i, m_causal) # these are the chosen locus indeces!
528
}

Read our documentation on viewing source code .

Loading