1
|
|
#' Simulation Scenario from Bhatnagar et al. (2018+) ggmix paper
|
2
|
|
#'
|
3
|
|
#' @description Function that generates data of the different simulation studies
|
4
|
|
#' presented in the accompanying paper. This function requires the
|
5
|
|
#' \code{popkin} and \code{bnpsd} package to be installed.
|
6
|
|
#' @param n number of observations to simulate
|
7
|
|
#' @param p_design number of variables in X_test, i.e., the design matrix
|
8
|
|
#' @param p_kinship number of variable in X_kinship, i.e., matrix used to
|
9
|
|
#' calculate kinship
|
10
|
|
#' @param k number of intermediate subpopulations.
|
11
|
|
#' @param s the desired bias coefficient, which specifies sigma indirectly.
|
12
|
|
#' Required if sigma is missing
|
13
|
|
#' @param Fst The desired final FST of the admixed individuals. Required if
|
14
|
|
#' sigma is missing
|
15
|
|
#' @param b0 the true intercept parameter
|
16
|
|
#' @param eta the true eta parameter, which has to be \code{0 < eta < 1}
|
17
|
|
#' @param sigma2 the true sigma2 parameter
|
18
|
|
#' @param nPC number of principal components to include in the design matrix
|
19
|
|
#' used for regression adjustment for population structure via principal
|
20
|
|
#' components. This matrix is used as the input in a standard lasso regression
|
21
|
|
#' routine, where there are no random effects.
|
22
|
|
#' @param geography the type of geography for simulation the kinship matrix.
|
23
|
|
#' "ind" is independent populations where every individuals is actually
|
24
|
|
#' unadmixed, "1d" is a 1D geography and "circ" is circular geography.
|
25
|
|
#' Default: "ind". See the functions in the \code{bnpsd} for details on how
|
26
|
|
#' this data is actually generated.
|
27
|
|
#' @param percent_causal percentage of \code{p_design} that is causal. must be
|
28
|
|
#' \eqn{0 \leq percent_causal \leq 1}. The true regression coefficients are
|
29
|
|
#' generated from a standard normal distribution.
|
30
|
|
#' @param percent_overlap this represents the percentage of causal SNPs that
|
31
|
|
#' will also be included in the calculation of the kinship matrix
|
32
|
|
#' @param train_tune_test the proportion of sample size used for training tuning
|
33
|
|
#' parameter selection and testing. default is 60/20/20 split
|
34
|
|
#' @details The kinship is estimated using the \code{popkin} function from the
|
35
|
|
#' \code{popkin} package. This function will multiple that kinship matrix by 2
|
36
|
|
#' to give the expected covariance matrix which is subsequently used in the
|
37
|
|
#' linear mixed models
|
38
|
|
#' @return A list with the following elements \describe{\item{ytrain}{simulated
|
39
|
|
#' response vector for training set} \item{ytune}{simulated response vector
|
40
|
|
#' for tuning parameter selection set} \item{ytest}{simulated response vector
|
41
|
|
#' for test set} \item{xtrain}{simulated design matrix for training
|
42
|
|
#' set}\item{xtune}{simulated design matrix for tuning parameter selection
|
43
|
|
#' set}\item{xtest}{simulated design matrix for testing set}
|
44
|
|
#' \item{xtrain_lasso}{simulated design matrix for training set for lasso
|
45
|
|
#' model. This is the same as xtrain, but also includes the nPC principal
|
46
|
|
#' components} \item{xtune_lasso}{simulated design matrix for tuning parameter
|
47
|
|
#' selection set for lasso model. This is the same as xtune, but also includes
|
48
|
|
#' the nPC principal components}\item{xtest}{simulated design matrix for
|
49
|
|
#' testing set for lasso model. This is the same as xtest, but also includes
|
50
|
|
#' the nPC principal components} \item{causal}{character vector of the names
|
51
|
|
#' of the causal SNPs} \item{beta}{the vector of true regression coefficients}
|
52
|
|
#' \item{kin_train}{2 times the estimated kinship for the training set
|
53
|
|
#' individuals} \item{kin_tune_train}{The covariance matrix between the tuning
|
54
|
|
#' set and the training set individuals} \item{kin_test_train}{The covariance
|
55
|
|
#' matrix between the test set and training set individuals}
|
56
|
|
#' \item{Xkinship}{the matrix of SNPs used to estimate the kinship matrix}
|
57
|
|
#' \item{not_causal}{character vector of the non-causal SNPs} \item{PC}{the
|
58
|
|
#' principal components for population structure adjustment} }
|
59
|
|
#' @seealso \code{\link[bnpsd]{admix_prop_1d_linear}}
|
60
|
|
#' @export
|
61
|
|
#' @examples
|
62
|
|
#' admixed <- gen_structured_model(n = 100,
|
63
|
|
#' p_design = 50,
|
64
|
|
#' p_kinship = 5e2,
|
65
|
|
#' geography = "1d",
|
66
|
|
#' percent_causal = 0.10,
|
67
|
|
#' percent_overlap = "100",
|
68
|
|
#' k = 5, s = 0.5, Fst = 0.1,
|
69
|
|
#' b0 = 0, nPC = 10,
|
70
|
|
#' eta = 0.1, sigma2 = 1,
|
71
|
|
#' train_tune_test = c(0.8, 0.1, 0.1))
|
72
|
|
#' names(admixed)
|
73
|
|
gen_structured_model <- function(n, p_design, p_kinship, k, s, Fst, b0, nPC = 10,
|
74
|
|
eta, sigma2, geography = c("ind", "1d", "circ"),
|
75
|
|
percent_causal, percent_overlap, train_tune_test = c(0.6, 0.2, 0.2)) {
|
76
|
|
|
77
|
|
# p_design:
|
78
|
|
# p_kinship:
|
79
|
|
# k: N
|
80
|
|
# s:
|
81
|
|
# F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations,
|
82
|
|
# up to a scaling factor (which cancels out in calculations). Required if sigma is missing
|
83
|
|
# Fst: T
|
84
|
|
# browser()
|
85
|
|
# define population structure
|
86
|
|
|
87
|
0
|
if(sum(train_tune_test) != 1) stop("Training/tune/test split must be equal to 1")
|
88
|
|
|
89
|
4
|
if (!requireNamespace("bnpsd", quietly = TRUE)) {
|
90
|
0
|
stop(strwrap("Package \"bnpsd\" needed to simulate data.
|
91
|
0
|
Please install it."),
|
92
|
0
|
call. = FALSE
|
93
|
|
)
|
94
|
|
}
|
95
|
|
|
96
|
|
# if (!requireNamespace("simtrait", quietly = TRUE)) {
|
97
|
|
# stop(strwrap("Package \"simtrait\" needed to simulate data.
|
98
|
|
# Please install it."),
|
99
|
|
# call. = FALSE
|
100
|
|
# )
|
101
|
|
# }
|
102
|
|
|
103
|
4
|
if (!requireNamespace("popkin", quietly = TRUE)) {
|
104
|
0
|
stop(strwrap("Package \"popkin\" needed to simulate data.
|
105
|
0
|
Please install it."),
|
106
|
0
|
call. = FALSE
|
107
|
|
)
|
108
|
|
}
|
109
|
|
|
110
|
|
|
111
|
|
########################
|
112
|
|
### Generate Kinship ###
|
113
|
|
########################
|
114
|
|
|
115
|
|
# FF <- 1:k # subpopulation FST vector, up to a scalar
|
116
|
|
# s <- 0.5 # desired bias coefficient
|
117
|
|
# Fst <- 0.1 # desired FST for the admixed individuals
|
118
|
4
|
geography <- match.arg(geography)
|
119
|
4
|
if (geography == "1d") {
|
120
|
4
|
FF <- 1:k # subpopulation FST vector, up to a scalar
|
121
|
4
|
obj <- bnpsd::admix_prop_1d_linear(n_ind = n,
|
122
|
4
|
k_subpops = k,
|
123
|
4
|
bias_coeff = s,
|
124
|
4
|
coanc_subpops = FF,
|
125
|
4
|
fst = Fst)
|
126
|
|
# Q <- obj$Q
|
127
|
|
# FF <- obj$F
|
128
|
4
|
admix_proportions <- obj$admix_proportions
|
129
|
|
# rescaled inbreeding vector for intermediate subpopulations
|
130
|
4
|
inbr_subpops <- obj$coanc_subpops
|
131
|
|
|
132
|
|
# get pop structure parameters of the admixed individuals
|
133
|
4
|
coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
|
134
|
4
|
kinship <- bnpsd::coanc_to_kinship(coancestry)
|
135
|
|
|
136
|
4
|
} else if (geography == "ind") {
|
137
|
4
|
ngroup <- n / k # equal sized groups
|
138
|
|
# here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3)
|
139
|
4
|
labs <- rep(paste0("S", 1:k), each = ngroup)
|
140
|
|
# data dimensions infered from labs:
|
141
|
4
|
length(labs) # number of individuals "n"
|
142
|
|
# desired admixture matrix ("is" stands for "Independent Subpopulations")
|
143
|
|
# number of subpopulations "k_subpops"
|
144
|
4
|
k_subpops <- length(unique(labs))
|
145
|
|
|
146
|
|
# desired admixture matrix
|
147
|
4
|
admix_proportions <- bnpsd::admix_prop_indep_subpops(labs)
|
148
|
|
|
149
|
|
# subpopulation FST vector, unnormalized so far
|
150
|
4
|
inbr_subpops <- 1 : k_subpops
|
151
|
|
# normalized to have the desired FST
|
152
|
|
# NOTE fst is a function in the `popkin` package
|
153
|
4
|
inbr_subpops <- inbr_subpops / popkin::fst(inbr_subpops) * Fst
|
154
|
|
# verify FST for the intermediate subpopulations
|
155
|
|
# fst(inbr_subpops)
|
156
|
|
#> [1] 0.2
|
157
|
|
|
158
|
|
# get coancestry of the admixed individuals
|
159
|
4
|
coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
|
160
|
|
# before getting FST for individuals, weigh then inversely proportional to subpop sizes
|
161
|
4
|
weights <- popkin::weights_subpops(labs) # function from `popkin` package
|
162
|
|
|
163
|
4
|
kinship <- bnpsd::coanc_to_kinship(coancestry)
|
164
|
|
|
165
|
4
|
} else if (geography == "circ") {
|
166
|
4
|
FF <- 1:k # subpopulation FST vector, up to a scalar
|
167
|
|
# obj <- bnpsd::admix_prop_1d_circular(n_ind = n, k_subpops = k, s = s, F = FF, Fst = Fst)
|
168
|
|
# Q <- obj$Q
|
169
|
|
# FF <- obj$F
|
170
|
|
|
171
|
|
# admixture proportions from *circular* 1D geography
|
172
|
4
|
obj <- bnpsd::admix_prop_1d_circular(
|
173
|
4
|
n_ind = n,
|
174
|
4
|
k_subpops = k,
|
175
|
4
|
bias_coeff = s,
|
176
|
4
|
coanc_subpops = FF,
|
177
|
4
|
fst = Fst
|
178
|
|
)
|
179
|
4
|
admix_proportions <- obj$admix_proportions
|
180
|
4
|
inbr_subpops <- obj$coanc_subpops
|
181
|
|
|
182
|
|
# get pop structure parameters of the admixed individuals
|
183
|
4
|
coancestry <- bnpsd::coanc_admix(admix_proportions, inbr_subpops)
|
184
|
|
|
185
|
4
|
kinship <- bnpsd::coanc_to_kinship(coancestry)
|
186
|
|
}
|
187
|
|
|
188
|
|
|
189
|
|
#####################
|
190
|
|
### GENERATE SNPS ###
|
191
|
|
#####################
|
192
|
|
|
193
|
4
|
ncausal <- p_design * percent_causal
|
194
|
|
# browser()
|
195
|
4
|
if (percent_overlap == "100") {
|
196
|
|
|
197
|
4
|
total_snps_to_simulate <- p_design + p_kinship - ncausal
|
198
|
|
|
199
|
|
# this contains all SNPs (X_{Design}:X_{kinship})
|
200
|
|
# out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate)
|
201
|
|
|
202
|
|
# draw all random Allele Freqs (AFs) and genotypes
|
203
|
|
# reuse the previous inbr_subpops, admix_proportions
|
204
|
4
|
out <- bnpsd::draw_all_admix(
|
205
|
4
|
admix_proportions = admix_proportions,
|
206
|
4
|
inbr_subpops = inbr_subpops,
|
207
|
4
|
m_loci = total_snps_to_simulate,
|
208
|
|
# NOTE by default p_subpops and p_ind are not returned, but here we will ask for them
|
209
|
4
|
want_p_subpops = TRUE,
|
210
|
4
|
want_p_ind = TRUE
|
211
|
|
)
|
212
|
|
|
213
|
4
|
Xall <- t(out$X) # genotypes are columns, rows are subjects
|
214
|
4
|
cnames <- paste0("X", 1:total_snps_to_simulate)
|
215
|
4
|
colnames(Xall) <- cnames
|
216
|
4
|
rownames(Xall) <- paste0("id", 1:n)
|
217
|
4
|
Xall[1:5,1:5]
|
218
|
4
|
dim(Xall)
|
219
|
4
|
subpops <- ceiling( (1:n)/n*k )
|
220
|
4
|
table(subpops) # got k=10 subpops with 100 individuals each
|
221
|
|
|
222
|
|
###################
|
223
|
|
### CAUSAL LOCI ###
|
224
|
|
###################
|
225
|
|
# browser()
|
226
|
|
# Snps used for kinship
|
227
|
4
|
snps_kinships <- sample(cnames, p_kinship, replace = FALSE)
|
228
|
|
|
229
|
|
# all causal snps are in kinship matrix
|
230
|
4
|
if (percent_causal != 0 ) {
|
231
|
|
# browser()
|
232
|
|
# compute marginal allele frequencies
|
233
|
4
|
p_anc_hat <- colMeans(Xall[,snps_kinships,drop=FALSE], na.rm = TRUE)/2
|
234
|
|
|
235
|
|
# select random SNPs! this performs the magic...
|
236
|
|
# also runs additional checks
|
237
|
4
|
causal_indexes <- select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05)
|
238
|
|
|
239
|
|
# draw random SNP coefficients for selected loci
|
240
|
|
# causal_coeffs <- stats::rnorm(ncausal, 0, 1)
|
241
|
|
|
242
|
4
|
causal <- snps_kinships[causal_indexes]
|
243
|
4
|
snps_design <- c(setdiff(cnames, snps_kinships), causal)
|
244
|
4
|
not_causal <- setdiff(snps_design, causal)
|
245
|
|
|
246
|
|
# subset data to consider causal loci only
|
247
|
4
|
p_anc_hat <- p_anc_hat[causal_indexes]
|
248
|
|
|
249
|
0
|
} else if (percent_causal == 0) {
|
250
|
0
|
causal <- ""
|
251
|
0
|
snps_design <- setdiff(cnames, snps_kinships)
|
252
|
0
|
not_causal <- snps_design
|
253
|
|
}
|
254
|
|
|
255
|
4
|
Xkinship <- Xall[,snps_kinships]
|
256
|
4
|
Xdesign <- Xall[,snps_design]
|
257
|
|
|
258
|
|
# Xdesign_causal <- Xall[,causal_indexes,drop=F] # the subset of causal data (keep as a matrix even if m_causal == 1)
|
259
|
|
|
260
|
|
# now estimate kinship using popkin
|
261
|
4
|
PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE)
|
262
|
4
|
mean_kinship <- mean(PhiHat)
|
263
|
|
# PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE)
|
264
|
|
|
265
|
0
|
} else if (percent_overlap == "0") {
|
266
|
|
|
267
|
0
|
total_snps_to_simulate <- p_design + p_kinship
|
268
|
|
|
269
|
|
# this contains all SNPs (X_{Testing}:X_{kinship})
|
270
|
|
# out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate)
|
271
|
|
|
272
|
|
# draw all random Allele Freqs (AFs) and genotypes
|
273
|
|
# reuse the previous inbr_subpops, admix_proportions
|
274
|
0
|
out <- bnpsd::draw_all_admix(
|
275
|
0
|
admix_proportions = admix_proportions,
|
276
|
0
|
inbr_subpops = inbr_subpops,
|
277
|
0
|
m_loci = total_snps_to_simulate,
|
278
|
|
# NOTE by default p_subpops and p_ind are not returned, but here we will ask for them
|
279
|
0
|
want_p_subpops = TRUE,
|
280
|
0
|
want_p_ind = TRUE
|
281
|
|
)
|
282
|
|
|
283
|
|
|
284
|
0
|
Xall <- t(out$X) # genotypes are columns, rows are subjects
|
285
|
0
|
cnames <- paste0("X", 1:total_snps_to_simulate)
|
286
|
0
|
colnames(Xall) <- cnames
|
287
|
0
|
rownames(Xall) <- paste0("id", 1:n)
|
288
|
0
|
Xall[1:5,1:5]
|
289
|
0
|
dim(Xall)
|
290
|
0
|
subpops <- ceiling( (1:n)/n*k )
|
291
|
0
|
table(subpops) # got k=10 subpops with 100 individuals each
|
292
|
|
|
293
|
|
|
294
|
|
# Snps used for kinship
|
295
|
0
|
snps_kinships <- sample(cnames, p_kinship, replace = FALSE)
|
296
|
0
|
length(snps_kinships)
|
297
|
|
|
298
|
0
|
snps_design <- setdiff(cnames, snps_kinships)
|
299
|
|
# length(snps_design)
|
300
|
|
# setdiff(cnames, snps_kinships) %>% length()
|
301
|
0
|
if (percent_causal !=0) {
|
302
|
|
|
303
|
|
# compute marginal allele frequencies
|
304
|
0
|
p_anc_hat <- colMeans(Xall[,snps_design, drop=FALSE], na.rm = TRUE)/2
|
305
|
|
|
306
|
|
# select random SNPs! this performs the magic...
|
307
|
|
# also runs additional checks
|
308
|
0
|
causal_indexes <- select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05)
|
309
|
|
|
310
|
|
# draw random SNP coefficients for selected loci
|
311
|
|
# causal_coeffs <- stats::rnorm(ncausal, 0, 1)
|
312
|
|
|
313
|
0
|
causal <- snps_design[causal_indexes]
|
314
|
|
# causal <- sample(snps_design, ncausal, replace = FALSE)
|
315
|
0
|
} else if (percent_causal == 0) {
|
316
|
0
|
causal <- ""
|
317
|
|
}
|
318
|
|
|
319
|
0
|
not_causal <- setdiff(snps_design, causal)
|
320
|
|
|
321
|
0
|
Xkinship <- Xall[,snps_kinships]
|
322
|
0
|
Xdesign <- Xall[,snps_design]
|
323
|
|
|
324
|
|
# now estimate kinship using popkin
|
325
|
0
|
PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE)
|
326
|
|
# PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE)
|
327
|
|
|
328
|
|
}
|
329
|
|
|
330
|
4
|
np <- dim(Xdesign)
|
331
|
4
|
n <- np[[1]]
|
332
|
4
|
p <- np[[2]]
|
333
|
|
|
334
|
4
|
beta <- rep(0, length = p)
|
335
|
4
|
if (percent_causal != 0) {
|
336
|
4
|
beta[which(colnames(Xdesign) %in% causal)] <- stats::rnorm(n = length(causal))
|
337
|
|
}
|
338
|
|
|
339
|
4
|
mu <- as.numeric(Xdesign %*% beta)
|
340
|
|
|
341
|
|
# y <- trait
|
342
|
4
|
kin <- 2 * PhiHat
|
343
|
|
|
344
|
4
|
tt <- eta * sigma2 * kin
|
345
|
4
|
if (!all(eigen(tt)$values > 0)) {
|
346
|
4
|
message("eta * sigma2 * kin not PD, using Matrix::nearPD")
|
347
|
4
|
tt <- Matrix::nearPD(tt)$mat
|
348
|
|
}
|
349
|
|
|
350
|
4
|
P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt)
|
351
|
4
|
E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n))
|
352
|
|
# y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim)
|
353
|
|
# y <- b0 + mu + t(P) + t(E)
|
354
|
|
# y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n))
|
355
|
4
|
y <- b0 + mu + P + E
|
356
|
|
|
357
|
|
# partition the data into train/tune/test
|
358
|
4
|
spec = c(train = train_tune_test[1], tune = train_tune_test[2], test = train_tune_test[3])
|
359
|
|
|
360
|
4
|
g = sample(cut(
|
361
|
4
|
seq(nrow(Xdesign)),
|
362
|
4
|
nrow(Xdesign)*cumsum(c(0,spec)),
|
363
|
4
|
labels = names(spec)
|
364
|
|
))
|
365
|
|
# g %>% table
|
366
|
|
|
367
|
4
|
train_ind <- which(g == "train")
|
368
|
4
|
tune_ind <- which(g == "tune")
|
369
|
4
|
test_ind <- which(g == "test")
|
370
|
|
# res = split(admixed$x, g)
|
371
|
|
|
372
|
4
|
xtrain <- Xdesign[train_ind,,drop=FALSE]
|
373
|
4
|
xtune <- Xdesign[tune_ind,,drop=FALSE]
|
374
|
4
|
xtest <- Xdesign[test_ind,,drop=FALSE]
|
375
|
|
|
376
|
4
|
ytrain <- y[train_ind]
|
377
|
4
|
ytune <- y[tune_ind]
|
378
|
4
|
ytest <- y[test_ind]
|
379
|
|
|
380
|
4
|
kin_train <- kin[train_ind,train_ind]
|
381
|
4
|
kin_tune_train <- kin[tune_ind,train_ind]
|
382
|
4
|
kin_test_train <- kin[test_ind,train_ind]
|
383
|
|
|
384
|
|
# xtrain <- Xdesign[ind,,drop=FALSE]
|
385
|
|
# xtune <- Xdesign[-ind,,drop=FALSE]
|
386
|
4
|
PC_all <- stats::prcomp(Xdesign)
|
387
|
4
|
PC <- stats::prcomp(xtrain)
|
388
|
4
|
xtrain_lasso <- cbind(xtrain, PC$x[,1:nPC])
|
389
|
4
|
xtune_pc <- stats::predict(PC, newdata = xtune)
|
390
|
4
|
xtune_lasso <- cbind(xtune, xtune_pc[,1:nPC])
|
391
|
4
|
xtest_pc <- stats::predict(PC, newdata = xtest)
|
392
|
4
|
xtest_lasso <- cbind(xtest, xtest_pc[,1:nPC])
|
393
|
|
|
394
|
4
|
mu_train <- mu[train_ind]
|
395
|
|
|
396
|
|
|
397
|
4
|
return(list(ytrain = ytrain,
|
398
|
4
|
ytune = ytune,
|
399
|
4
|
ytest = ytest,
|
400
|
|
|
401
|
4
|
xtrain = xtrain,
|
402
|
4
|
xtune = xtune,
|
403
|
4
|
xtest = xtest,
|
404
|
|
|
405
|
4
|
xtrain_lasso = xtrain_lasso,
|
406
|
4
|
xtune_lasso = xtune_lasso,
|
407
|
4
|
xtest_lasso = xtest_lasso,
|
408
|
|
|
409
|
4
|
Xkinship = Xall[train_ind,snps_kinships, drop = F],
|
410
|
4
|
kin_train = kin_train,
|
411
|
4
|
kin_tune_train = kin_tune_train,
|
412
|
4
|
kin_test_train = kin_test_train, # covaraince between train and test
|
413
|
|
|
414
|
4
|
mu_train = mu_train, # Xbeta for training set
|
415
|
|
|
416
|
4
|
causal = causal,
|
417
|
4
|
beta = beta,
|
418
|
|
|
419
|
4
|
not_causal = not_causal,
|
420
|
|
|
421
|
|
# used in manuscript to generate simulation study figures
|
422
|
4
|
kinship = kinship,
|
423
|
4
|
coancestry = coancestry,
|
424
|
4
|
PC = PC_all$x[,1:nPC],
|
425
|
4
|
subpops = subpops
|
426
|
|
))
|
427
|
|
}
|
428
|
|
|
429
|
|
|
430
|
0
|
l2norm <- function(x) sqrt(sum(x^2))
|
431
|
|
|
432
|
|
|
433
|
|
"%ni%" <- Negate("%in%")
|
434
|
|
|
435
|
|
# internal function
|
436
|
|
# taken verbatim from glmnet package.
|
437
|
|
# it used to be exported by glmnet, but is no longer exported.
|
438
|
|
lambda.interp <- function (lambda, s) {
|
439
|
4
|
if (length(lambda) == 1) {
|
440
|
0
|
nums = length(s)
|
441
|
0
|
left = rep(1, nums)
|
442
|
0
|
right = left
|
443
|
0
|
sfrac = rep(1, nums)
|
444
|
|
}
|
445
|
|
else {
|
446
|
4
|
s[s > max(lambda)] = max(lambda)
|
447
|
4
|
s[s < min(lambda)] = min(lambda)
|
448
|
4
|
k = length(lambda)
|
449
|
4
|
sfrac <- (lambda[1] - s)/(lambda[1] - lambda[k])
|
450
|
4
|
lambda <- (lambda[1] - lambda)/(lambda[1] - lambda[k])
|
451
|
4
|
coord <- stats::approx(lambda, seq(lambda), sfrac)$y
|
452
|
4
|
left <- floor(coord)
|
453
|
4
|
right <- ceiling(coord)
|
454
|
4
|
sfrac = (sfrac - lambda[right])/(lambda[left] - lambda[right])
|
455
|
4
|
sfrac[left == right] = 1
|
456
|
4
|
sfrac[abs(lambda[left] - lambda[right]) < .Machine$double.eps] = 1
|
457
|
|
}
|
458
|
4
|
list(left = left, right = right, frac = sfrac)
|
459
|
|
}
|
460
|
|
|
461
|
|
|
462
|
|
|
463
|
|
# internal function
|
464
|
|
# taken verbatim from glmnet package.
|
465
|
|
# it used to be exported by glmnet, but is no longer exported.
|
466
|
|
nonzeroCoef <- function (beta, bystep = FALSE) {
|
467
|
4
|
nr = nrow(beta)
|
468
|
4
|
if (nr == 1) {
|
469
|
0
|
if (bystep)
|
470
|
0
|
apply(beta, 2, function(x) if (abs(x) > 0)
|
471
|
0
|
1
|
472
|
0
|
else NULL)
|
473
|
|
else {
|
474
|
0
|
if (any(abs(beta) > 0))
|
475
|
0
|
1
|
476
|
0
|
else NULL
|
477
|
|
}
|
478
|
|
}
|
479
|
|
else {
|
480
|
4
|
beta = abs(beta) > 0
|
481
|
4
|
which = seq(nr)
|
482
|
4
|
ones = rep(1, ncol(beta))
|
483
|
4
|
nz = as.vector((beta %*% ones) > 0)
|
484
|
4
|
which = which[nz]
|
485
|
4
|
if (bystep) {
|
486
|
4
|
if (length(which) > 0) {
|
487
|
4
|
beta = as.matrix(beta[which, , drop = FALSE])
|
488
|
4
|
nzel = function(x, which) if (any(x))
|
489
|
4
|
which[x]
|
490
|
4
|
else NULL
|
491
|
4
|
which = apply(beta, 2, nzel, which)
|
492
|
4
|
if (!is.list(which))
|
493
|
4
|
which = data.frame(which)
|
494
|
4
|
which
|
495
|
|
}
|
496
|
|
else {
|
497
|
0
|
dn = dimnames(beta)[[2]]
|
498
|
0
|
which = vector("list", length(dn))
|
499
|
0
|
names(which) = dn
|
500
|
0
|
which
|
501
|
|
}
|
502
|
|
}
|
503
|
4
|
else which
|
504
|
|
}
|
505
|
|
}
|
506
|
|
|
507
|
|
|
508
|
|
|
509
|
|
# internal function
|
510
|
|
# taken verbatim from https://github.com/OchoaLab/simtrait/blob/master/R/select_loci.R
|
511
|
|
select_loci <- function(maf, m_causal, maf_cut = 0.05) {
|
512
|
|
# check for missing parameters
|
513
|
4
|
if (missing(maf))
|
514
|
0
|
stop('marginal allele frequency vector `maf` is required!')
|
515
|
4
|
if (missing(m_causal))
|
516
|
0
|
stop('the number of causal loci `m_causal` is required!')
|
517
|
|
|
518
|
|
# data dimensions
|
519
|
4
|
m <- length(maf)
|
520
|
|
# other checks
|
521
|
4
|
if (m_causal > m)
|
522
|
0
|
stop('the number of causal loci cannot be larger than the total number of loci (', m_causal, ' > ', m, ')')
|
523
|
|
|
524
|
|
# select random loci!
|
525
|
|
# we might not want to pick extremely rare alleles, so set MAF thresholds
|
526
|
4
|
i <- which(maf_cut <= maf & maf <= 1 - maf_cut) # candidate locus indexes
|
527
|
4
|
sample(i, m_causal) # these are the chosen locus indeces!
|
528
|
|
}
|