1
|
|
#' Fit qpAdm models based on the rotation strategy described in
|
2
|
|
#' Harney et al. 2020 (bioRxiv)
|
3
|
|
#'
|
4
|
|
#' @param data EIGENSTRAT dataset
|
5
|
|
#' @param target Target population that is modeled as admixed
|
6
|
|
#' @param candidates Potential candidates for sources and outgroups
|
7
|
|
#' @param minimize Test also all possible subsets of outgroups? (default TRUE)
|
8
|
|
#' @param nsources Number of sources to pull from the candidates
|
9
|
|
#' @param ncores Number of CPU cores to utilize for model fitting
|
10
|
|
#' @param fulloutput Report also 'ranks' and 'subsets' analysis from
|
11
|
|
#' qpAdm in addition to the admixture proportions results? (default FALSE)
|
12
|
|
#'
|
13
|
|
#' @return qpAdm list with proportions, ranks and subsets elements (as
|
14
|
|
#' with a traditional qpAdm run) or just the proportions
|
15
|
|
#' (determined by the value of the 'fulloutput' argument)
|
16
|
|
#'
|
17
|
|
#' @examples
|
18
|
|
#' \dontrun{# download an example genomic data set and prepare it for analysis
|
19
|
|
#' snps <- eigenstrat(download_data(dirname = tempdir()))
|
20
|
|
#'
|
21
|
|
#' # find the set of most likely two-source qpAdm models of
|
22
|
|
#' # a French individual - produce only the 'proportions'
|
23
|
|
#' # qpAdm summary
|
24
|
|
#' models <- qpAdm_rotation(
|
25
|
|
#' data = snps,
|
26
|
|
#' target = "French",
|
27
|
|
#' candidates = c("Dinka", "Mbuti", "Yoruba", "Vindija",
|
28
|
|
#' "Altai", "Denisova", "Chimp"),
|
29
|
|
#' minimize = TRUE,
|
30
|
|
#' nsources = 2,
|
31
|
|
#' ncores = 2,
|
32
|
|
#' fulloutput = FALSE
|
33
|
|
#' )
|
34
|
|
#' }
|
35
|
|
#'
|
36
|
|
#' @importFrom utils combn
|
37
|
|
#' @export
|
38
|
|
qpAdm_rotation <- function(data, target, candidates, minimize = TRUE, nsources = 2, ncores = 1, fulloutput = FALSE) {
|
39
|
1
|
check_type(data, "EIGENSTRAT")
|
40
|
|
|
41
|
|
## generate combinations of possible sources and outgroups
|
42
|
1
|
sources <- t(combn(candidates, nsources))
|
43
|
1
|
sources_outgroups <- unlist(lapply(1:nrow(sources), function(i) {
|
44
|
1
|
outgroups <- setdiff(candidates, sources[i, ])
|
45
|
1
|
if (minimize) {
|
46
|
0
|
outgroups <- unlist(lapply((nsources + 1):length(outgroups), function(nout) {
|
47
|
0
|
outcomb <- t(combn(outgroups, nout))
|
48
|
0
|
lapply(1:nrow(outcomb), function(j) outcomb[j, ])
|
49
|
0
|
}), recursive = FALSE)
|
50
|
|
} else {
|
51
|
1
|
outgroups <- list(outgroups)
|
52
|
|
}
|
53
|
|
|
54
|
1
|
lapply(outgroups, function(out) { list(sources = sources[i, ], outgroups = out) })
|
55
|
1
|
}), recursive = FALSE)
|
56
|
|
|
57
|
|
## run qpAdm for all combinations of sources and outgroups
|
58
|
1
|
results_list <- parallel::mclapply(sources_outgroups, function(x) {
|
59
|
1
|
result <- qpAdm(
|
60
|
1
|
data,
|
61
|
1
|
target = target, sources = x$sources, outgroups = x$outgroups
|
62
|
|
)
|
63
|
|
|
64
|
1
|
names(x$sources) <- paste0("source", 1:nsources)
|
65
|
1
|
sources_df <- as.data.frame(t(as.matrix(x$sources)))
|
66
|
|
|
67
|
|
## rename sources and stderr columns (by default, proportion and
|
68
|
|
## stderr columns are named based on the source populations - we
|
69
|
|
## don't want that here because we want to merge all the individual
|
70
|
|
## proportion tables, columns have to have the same name)
|
71
|
1
|
names(result$proportions)[2:(1 + nsources)] <- paste0("prop", 1:nsources)
|
72
|
1
|
names(result$proportions)[4:(3 + nsources)] <- paste0("stderr", 1:nsources)
|
73
|
|
## add source names as two new columns
|
74
|
1
|
result$proportions <- cbind(result$proportions, sources_df) %>%
|
75
|
1
|
dplyr::mutate(outgroups = paste0(x$outgroups, collapse = " & "),
|
76
|
1
|
noutgroups = length(x$outgroups))
|
77
|
|
## rearrange columns
|
78
|
1
|
result$proportions <- dplyr::select(
|
79
|
1
|
result$proportions, target, names(x$sources), outgroups, noutgroups, pvalue,
|
80
|
1
|
dplyr::everything()
|
81
|
|
)
|
82
|
|
|
83
|
|
## reformat rank table
|
84
|
1
|
names(result$subsets)[7:(6 + nsources)] <- paste0("prop", 1:nsources)
|
85
|
|
## add source names as two new columns
|
86
|
1
|
result$subsets <- cbind(result$subsets, sources_df)
|
87
|
1
|
result$subsets <- dplyr::select(
|
88
|
1
|
result$subsets, target, names(x$sources), pattern,
|
89
|
1
|
dplyr::everything()
|
90
|
|
)
|
91
|
|
|
92
|
1
|
result
|
93
|
1
|
}, mc.cores = ncores)
|
94
|
|
|
95
|
|
## extract log information before further processing of the results
|
96
|
1
|
log_lines <- sapply(results_list, function(i) attr(i, "log_output"))
|
97
|
1
|
names(log_lines) <- paste0("m", seq_along(sources_outgroups))
|
98
|
|
|
99
|
1
|
proportions <- dplyr::bind_rows(lapply(results_list, `[[`, "proportions"))
|
100
|
1
|
ranks <- dplyr::bind_rows(lapply(results_list, `[[`, "ranks"))
|
101
|
1
|
subsets <- dplyr::bind_rows(lapply(results_list, `[[`, "subsets"))
|
102
|
|
|
103
|
|
## add model identifier to each row in the proportions table, ...
|
104
|
1
|
models <- paste0("m", seq_along(sources_outgroups))
|
105
|
1
|
proportions$model <- models
|
106
|
1
|
proportions <- dplyr::as_tibble(proportions) %>% dplyr::select(model, dplyr::everything())
|
107
|
|
## ... ranks table, ...
|
108
|
1
|
ranks$model <- sort(rep(models, 2))
|
109
|
1
|
ranks <- dplyr::as_tibble(ranks) %>% dplyr::select(model, dplyr::everything())
|
110
|
|
## and subsets table
|
111
|
1
|
subsets$model <- sort(rep(models, 1 + 2^(nsources - 1)))
|
112
|
1
|
subsets <- dplyr::as_tibble(subsets) %>% dplyr::select(model, dplyr::everything())
|
113
|
|
|
114
|
|
## add metadata to the results object
|
115
|
1
|
if (fulloutput)
|
116
|
1
|
results <- list(proportions = proportions, ranks = ranks, subsets = subsets)
|
117
|
|
else
|
118
|
1
|
results <- proportions
|
119
|
|
|
120
|
1
|
attr(results, "command") <- "qpAdm_rotation"
|
121
|
1
|
attr(results, "log_output") <- log_lines
|
122
|
1
|
class(results) <- c("admixr_result", class(results))
|
123
|
|
|
124
|
1
|
results
|
125
|
|
}
|
126
|
|
|
127
|
|
|
128
|
|
#' Filter qpAdm rotation results for only 'sensible' models
|
129
|
|
#'
|
130
|
|
#' Filter for p-value larger than a specified cuttof and admixture
|
131
|
|
#' proportions between 0 and 1.
|
132
|
|
#'
|
133
|
|
#' @param x Output of a qpAdm_rotation() function
|
134
|
|
#' @param p p-value cutoff (default 0: will only filter for sensible
|
135
|
|
#' admixture proportions)
|
136
|
|
#'
|
137
|
|
#' @return qpAdm_rotation object filtered down based on p-value
|
138
|
|
#'
|
139
|
|
#'
|
140
|
|
#'
|
141
|
|
#' @examples
|
142
|
|
#' \dontrun{# download an example genomic data set and prepare it for analysis
|
143
|
|
#' snps <- eigenstrat(download_data(dirname = tempdir()))
|
144
|
|
#'
|
145
|
|
#' # find the set of most likely two-source qpAdm models of
|
146
|
|
#' # a French individual - produce only the 'proportions'
|
147
|
|
#' # qpAdm summary
|
148
|
|
#' models <- qpAdm_rotation(
|
149
|
|
#' data = snps,
|
150
|
|
#' target = "French",
|
151
|
|
#' candidates = c("Dinka", "Mbuti", "Yoruba", "Vindija",
|
152
|
|
#' "Altai", "Denisova", "Chimp"),
|
153
|
|
#' minimize = TRUE,
|
154
|
|
#' nsources = 2,
|
155
|
|
#' ncores = 2,
|
156
|
|
#' fulloutput = FALSE
|
157
|
|
#' )
|
158
|
|
#'
|
159
|
|
#' # filter out models which can clearly be rejected
|
160
|
|
#' fits <- qpAdm_filter(models, p = 0.05)
|
161
|
|
#' }
|
162
|
|
#'
|
163
|
|
#' @export
|
164
|
|
qpAdm_filter <- function(x, p = 0.05) {
|
165
|
1
|
check_type(x, "admixr_result")
|
166
|
1
|
if (attr(x, "command") != "qpAdm_rotation") {
|
167
|
0
|
stop("Filtering implemented only for results of the qpAdm rotation procedure",
|
168
|
0
|
call. = FALSE)
|
169
|
|
}
|
170
|
|
|
171
|
1
|
if (length(x) == 3)
|
172
|
1
|
proportions <- x$proportions
|
173
|
|
else
|
174
|
0
|
proportions <- x
|
175
|
|
|
176
|
|
## get positions of columns with estimated admixture proportions
|
177
|
1
|
prop_columns <- stringr::str_which(names(proportions), "prop")
|
178
|
|
|
179
|
|
## find out rows/models for which all proportions are in [0, 1] and
|
180
|
|
## pvalue is larger than the required cutoff
|
181
|
1
|
pvalue <- proportions$pvalue > p
|
182
|
1
|
constr_0 <- proportions[, prop_columns] >= 0
|
183
|
1
|
constr_1 <- proportions[, prop_columns] <= 1
|
184
|
1
|
constr <- apply(pvalue & constr_0 & constr_1, 1, all)
|
185
|
|
|
186
|
|
## filter all three sub-tables to only those models that fit the criteria
|
187
|
1
|
proportions <- dplyr::arrange(proportions[constr, ], -pvalue)
|
188
|
|
|
189
|
1
|
if (length(x) == 3) {
|
190
|
1
|
x$proportions <- proportions
|
191
|
1
|
x$ranks <- x$ranks[x$ranks$model %in% proportions$model, ]
|
192
|
1
|
x$subsets <- x$subsets[x$subsets$model %in% proportions$model, ]
|
193
|
|
## filter also only to relevant remaining log output information
|
194
|
1
|
attr(x, "log_output") <- attr(x, "log_output")[unique(proportions$model)]
|
195
|
1
|
return(x)
|
196
|
|
} else {
|
197
|
0
|
attr(proportions, "log_output") <- attr(x, "log_output")[unique(proportions$model)]
|
198
|
0
|
return(proportions)
|
199
|
|
}
|
200
|
|
}
|