1
#' Fit qpAdm models based on the rotation strategy described in
2
#' Harney et al. 2020 (bioRxiv)
3
#'
4
#' @param data EIGENSTRAT dataset
5
#' @param target Target population that is modeled as admixed
6
#' @param candidates Potential candidates for sources and outgroups
7
#' @param minimize Test also all possible subsets of outgroups? (default TRUE)
8
#' @param nsources Number of sources to pull from the candidates
9
#' @param ncores Number of CPU cores to utilize for model fitting
10
#' @param fulloutput Report also 'ranks' and 'subsets' analysis from
11
#'     qpAdm in addition to the admixture proportions results? (default FALSE)
12
#'
13
#' @return qpAdm list with proportions, ranks and subsets elements (as
14
#'     with a traditional qpAdm run) or just the proportions
15
#'     (determined by the value of the 'fulloutput' argument)
16
#'
17
#' @examples
18
#' \dontrun{# download an example genomic data set and prepare it for analysis
19
#' snps <- eigenstrat(download_data(dirname = tempdir()))
20
#'
21
#' # find the set of most likely two-source qpAdm models of
22
#' # a French individual - produce only the 'proportions'
23
#' # qpAdm summary
24
#' models <- qpAdm_rotation(
25
#'     data = snps,
26
#'     target = "French",
27
#'     candidates = c("Dinka", "Mbuti", "Yoruba", "Vindija",
28
#'                    "Altai", "Denisova", "Chimp"),
29
#'     minimize = TRUE,
30
#'     nsources = 2,
31
#'     ncores = 2,
32
#'     fulloutput = FALSE
33
#' )
34
#' }
35
#'
36
#' @importFrom utils combn
37
#' @export
38
qpAdm_rotation <- function(data, target, candidates, minimize = TRUE, nsources = 2, ncores = 1, fulloutput = FALSE) {
39 1
    check_type(data, "EIGENSTRAT")
40

41
    ## generate combinations of possible sources and outgroups
42 1
    sources <- t(combn(candidates, nsources))
43 1
    sources_outgroups <- unlist(lapply(1:nrow(sources), function(i) {
44 1
        outgroups <- setdiff(candidates, sources[i, ])
45 1
        if (minimize) {
46 0
            outgroups <- unlist(lapply((nsources + 1):length(outgroups), function(nout) {
47 0
                outcomb <- t(combn(outgroups, nout))
48 0
                lapply(1:nrow(outcomb), function(j) outcomb[j, ])
49 0
            }), recursive = FALSE)
50
        } else {
51 1
            outgroups <- list(outgroups)
52
        }
53

54 1
        lapply(outgroups, function(out) { list(sources = sources[i, ], outgroups = out) })
55 1
    }), recursive = FALSE)
56

57
    ## run qpAdm for all combinations of sources and outgroups
58 1
    results_list <- parallel::mclapply(sources_outgroups, function(x) {
59 1
        result <- qpAdm(
60 1
            data,
61 1
            target = target, sources = x$sources, outgroups = x$outgroups
62
        )
63

64 1
        names(x$sources) <- paste0("source", 1:nsources)
65 1
        sources_df <- as.data.frame(t(as.matrix(x$sources)))
66

67
        ## rename sources and stderr columns (by default, proportion and
68
        ## stderr columns are named based on the source populations - we
69
        ## don't want that here because we want to merge all the individual
70
        ## proportion tables, columns have to have the same name)
71 1
        names(result$proportions)[2:(1 + nsources)] <- paste0("prop", 1:nsources)
72 1
        names(result$proportions)[4:(3 + nsources)] <- paste0("stderr", 1:nsources)
73
        ## add source names as two new columns
74 1
        result$proportions <- cbind(result$proportions, sources_df) %>%
75 1
            dplyr::mutate(outgroups = paste0(x$outgroups, collapse = " & "),
76 1
                          noutgroups = length(x$outgroups))
77
        ## rearrange columns
78 1
        result$proportions <- dplyr::select(
79 1
            result$proportions, target, names(x$sources), outgroups, noutgroups, pvalue,
80 1
            dplyr::everything()
81
        )
82

83
        ## reformat rank table
84 1
        names(result$subsets)[7:(6 + nsources)] <- paste0("prop", 1:nsources)
85
        ## add source names as two new columns
86 1
        result$subsets <- cbind(result$subsets, sources_df)
87 1
        result$subsets <- dplyr::select(
88 1
            result$subsets, target, names(x$sources), pattern,
89 1
            dplyr::everything()
90
        )
91

92 1
        result
93 1
    }, mc.cores = ncores)
94
    
95
    ## extract log information before further processing of the results
96 1
    log_lines <- sapply(results_list, function(i) attr(i, "log_output"))
97 1
    names(log_lines) <- paste0("m", seq_along(sources_outgroups))
98

99 1
    proportions <- dplyr::bind_rows(lapply(results_list, `[[`, "proportions"))
100 1
    ranks <- dplyr::bind_rows(lapply(results_list, `[[`, "ranks"))
101 1
    subsets <- dplyr::bind_rows(lapply(results_list, `[[`, "subsets"))
102

103
    ## add model identifier to each row in the proportions table, ...
104 1
    models <- paste0("m", seq_along(sources_outgroups))
105 1
    proportions$model <- models
106 1
    proportions <- dplyr::as_tibble(proportions) %>% dplyr::select(model, dplyr::everything())
107
    ## ... ranks table, ...
108 1
    ranks$model <- sort(rep(models, 2))
109 1
    ranks <- dplyr::as_tibble(ranks) %>% dplyr::select(model, dplyr::everything())
110
    ## and subsets table
111 1
    subsets$model <- sort(rep(models, 1 + 2^(nsources - 1)))
112 1
    subsets <- dplyr::as_tibble(subsets) %>% dplyr::select(model, dplyr::everything())
113

114
    ## add metadata to the results object
115 1
    if (fulloutput)
116 1
        results <- list(proportions = proportions, ranks = ranks, subsets = subsets)
117
    else
118 1
        results <- proportions
119
        
120 1
    attr(results, "command") <- "qpAdm_rotation"
121 1
    attr(results, "log_output") <- log_lines
122 1
    class(results) <- c("admixr_result", class(results))
123

124 1
    results
125
}
126

127

128
#' Filter qpAdm rotation results for only 'sensible' models
129
#'
130
#' Filter for p-value larger than a specified cuttof and admixture
131
#' proportions between 0 and 1.
132
#'
133
#' @param x Output of a qpAdm_rotation() function
134
#' @param p p-value cutoff (default 0: will only filter for sensible
135
#'     admixture proportions)
136
#'
137
#' @return qpAdm_rotation object filtered down based on p-value
138
#'
139
#' 
140
#'
141
#' @examples
142
#' \dontrun{# download an example genomic data set and prepare it for analysis
143
#' snps <- eigenstrat(download_data(dirname = tempdir()))
144
#'
145
#' # find the set of most likely two-source qpAdm models of
146
#' # a French individual - produce only the 'proportions'
147
#' # qpAdm summary
148
#' models <- qpAdm_rotation(
149
#'     data = snps,
150
#'     target = "French",
151
#'     candidates = c("Dinka", "Mbuti", "Yoruba", "Vindija",
152
#'                    "Altai", "Denisova", "Chimp"),
153
#'     minimize = TRUE,
154
#'     nsources = 2,
155
#'     ncores = 2,
156
#'     fulloutput = FALSE
157
#' )
158
#'
159
#' # filter out models which can clearly be rejected
160
#' fits <- qpAdm_filter(models, p = 0.05)
161
#' }
162
#'
163
#' @export
164
qpAdm_filter <- function(x, p = 0.05) {
165 1
    check_type(x, "admixr_result")
166 1
    if (attr(x, "command") != "qpAdm_rotation") {
167 0
        stop("Filtering implemented only for results of the qpAdm rotation procedure",
168 0
             call. = FALSE)
169
    }
170

171 1
    if (length(x) == 3)
172 1
        proportions <- x$proportions
173
    else
174 0
        proportions <- x
175

176
    ## get positions of columns with estimated admixture proportions
177 1
    prop_columns <- stringr::str_which(names(proportions), "prop")
178

179
    ## find out rows/models for which all proportions are in [0, 1] and
180
    ## pvalue is larger than the required cutoff
181 1
    pvalue <- proportions$pvalue > p
182 1
    constr_0 <- proportions[, prop_columns] >= 0
183 1
    constr_1 <- proportions[, prop_columns] <= 1
184 1
    constr <- apply(pvalue & constr_0 & constr_1, 1, all)
185

186
    ## filter all three sub-tables to only those models that fit the criteria
187 1
    proportions <- dplyr::arrange(proportions[constr, ], -pvalue)
188

189 1
    if (length(x) == 3) {
190 1
        x$proportions <- proportions
191 1
        x$ranks <- x$ranks[x$ranks$model %in% proportions$model, ]
192 1
        x$subsets <- x$subsets[x$subsets$model %in% proportions$model, ]
193
        ## filter also only to relevant remaining log output information
194 1
        attr(x, "log_output") <- attr(x, "log_output")[unique(proportions$model)]
195 1
        return(x)
196
    } else {
197 0
        attr(proportions, "log_output") <- attr(x, "log_output")[unique(proportions$model)]
198 0
        return(proportions)
199
    }
200
}

Read our documentation on viewing source code .

Loading