Compare 0dd1c2b ... +2 ... 0aca64d

Coverage Reach
normalisation.R

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then setup custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.

Showing 1 of 7 files from the diff.

@@ -1,341 +1,13 @@
Loading
1 -
# lsei import is for scran
2 -
3 -
#' State-of-the-art preprocessing and normalisation
4 -
#'
5 -
#' Based on \url{https://f1000research.com/articles/5-2122/v2} and \url{https://www.bioconductor.org/help/workflows/simpleSingleCell/}.
1 +
#' Normalisation
6 2
#'
7 3
#' @param counts The counts matrix, with features in columns
8 -
#' @param filter_cells Whether the cells have to be filtered
9 -
#' @param filter_features Whether the features have to be filtered
10 -
#' @param filter_hvg Whether to filter on highly variable features
11 -
#' @param normalisation How to normalise
12 -
#' @param has_spike Does this contain spike-ins, for which the feature names are preseded by ERCC
13 -
#' @param verbose Whether to add plots
14 -
#' @param nmads Number of median deviations for filtering outlier cells
15 -
#' @param min_ave_expression Minimal average expression of a feature
16 -
#' @param hvg_fdr FDR feature filtering cutoff
17 -
#' @param hvg_bio Biological feature filtering cutoff
18 -
#' @param min_variable_fraction Minimal number of variable features to retain
19 -
#'
20 -
#' @importFrom Matrix t rowMeans
21 -
#' @importFrom scater calculateQCMetrics isOutlier normalize
22 -
#' @importFrom SingleCellExperiment isSpike SingleCellExperiment counts sizeFactors logcounts
23 -
#' @importFrom scran computeSumFactors computeSpikeFactors trendVar decomposeVar
24 -
#' @importFrom limSolve lsei
25 -
#' @importFrom stats sd
26 -
#' @importFrom aroma.light aroma.light
27 -
#' @importFrom reshape2 melt
4 +
#' @import Matrix
28 5
#' @export
29 6
normalise_filter_counts <- function(
30 -
  counts,
31 -
  filter_cells = TRUE,
32 -
  filter_features = TRUE,
33 -
  filter_hvg = TRUE,
34 -
  normalisation = "scran_size_factors",
35 -
  has_spike = any(grepl("^ERCC", colnames(counts))),
36 -
  verbose = FALSE,
37 -
  nmads = 3,
38 -
  min_ave_expression = 0.02,
39 -
  hvg_fdr = 0.05,
40 -
  hvg_bio = 0.5,
41 -
  min_variable_fraction = 0.15
7 +
  counts
42 8
) {
43 -
  if (verbose) {
44 -
    requireNamespace("ggplot2")
45 -
    requireNamespace("KernSmooth")
46 -
  }
47 -
48 -
  normalisation_plots <- list()
49 -
  normalisation_steps <- tibble()
50 -
51 -
  ########################################
52 -
  # Create data object
53 -
  ########################################
54 -
55 -
  # convert to integer
56 -
  counts <- floor(counts)
57 -
58 -
  # filter zero cells
59 -
  counts <- counts[apply(counts, 1, max) > 0, ]
60 -
61 -
  if (nrow(counts) == 0) {
62 -
    stop("All counts are zero!")
63 -
  }
64 -
65 -
  # create object
66 -
  sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = Matrix::t(counts)))
67 -
68 -
  # mitochondrial
69 -
  mitochondrial <- grepl("^(mt|MT|Mt)-", rownames(sce))
70 -
  has_mito <- any(mitochondrial)
71 -
72 -
  # spike ins
73 -
  spike <- grepl("^ERCC", rownames(sce))
74 -
  has_spike <- any(spike)
75 -
76 -
  # calculate qc metrics
77 -
  feature_controls <- list()
78 -
79 -
  if (has_mito) feature_controls$Mt <- mitochondrial
80 -
  if (has_spike) feature_controls$ERCC <- grepl("^ERCC", rownames(sce))
81 -
82 -
  sce <- scater::calculateQCMetrics(
83 -
    sce,
84 -
    feature_controls = feature_controls,
85 -
    compact = TRUE
86 -
  )
87 -
88 -
  if (has_spike) {
89 -
    SingleCellExperiment::isSpike(sce, "ERCC") <- spike
90 -
  }
91 -
92 -
  # plots
93 -
  if (verbose) {
94 -
    normalisation_steps <- tribble(
95 -
      ~type, ~nfeatures, ~ncells,
96 -
      "original", nrow(sce), ncol(sce)
97 -
    )
98 -
    cat("Original: features - ", nrow(sce), " Cells - ", ncol(sce), sep = "")
99 -
100 -
    normalisation_plots$library <-
101 -
      ggplot(as.data.frame(sce$scater_qc$all)) +
102 -
      geom_histogram(aes(total_counts)) +
103 -
      scale_x_continuous(limits = c(0, NA))
104 -
105 -
    if (has_spike) {
106 -
      normalisation_plots$spike <-
107 -
        ggplot(as.data.frame(sce$scater_qc$feature_control_ERCC)) +
108 -
        geom_histogram(aes(pct_counts)) +
109 -
        scale_x_continuous(limits = c(0, 100))
110 -
    }
111 -
112 -
    if (has_mito) {
113 -
      normalisation_plots$mito <-
114 -
        ggplot(as.data.frame(sce$scater_qc$feature_control_Mt)) +
115 -
        geom_histogram(aes(pct_counts)) +
116 -
        scale_x_continuous(limits = c(0, 100))
117 -
    }
118 -
119 -
  }
120 -
121 -
  ########################################
122 -
  # Filter cells
123 -
  ########################################
124 -
125 -
  if (filter_cells) {
126 -
    total_counts <- sce$scater_qc$all$log10_total_counts
127 -
    total_features <- sce$scater_qc$all$log10_total_features_by_counts
128 -
    pct_counts_Mt <- sce$scater_qc$feature_control_Mt$pct_counts
129 -
    pct_counts_ERCC <- sce$scater_qc$feature_control_ERCC$pct_counts
130 -
131 -
    mito_drop <- rep(FALSE, length(total_counts))
132 -
    spike_drop <- rep(FALSE, length(total_counts))
133 -
134 -
    libsize_drop <- scater::isOutlier(total_counts, nmads = nmads, type = "lower", log = TRUE)
135 -
    feature_drop <- scater::isOutlier(total_features, nmads = nmads, type = "lower", log = TRUE)
136 -
    if (has_mito) mito_drop <- scater::isOutlier(pct_counts_Mt, nmads = nmads, type = "higher")
137 -
    if (has_spike) spike_drop <- scater::isOutlier(pct_counts_ERCC, nmads = nmads, type = "higher")
138 -
139 -
    if (verbose) {
140 -
      tibble(sum(mito_drop), sum(spike_drop), sum(libsize_drop), sum(feature_drop)) %>% print()
141 -
    }
142 -
143 -
    sce <- sce[,!(libsize_drop | feature_drop | mito_drop | spike_drop)]
144 -
145 -
    if (verbose) {
146 -
      normalisation_steps <- normalisation_steps %>%
147 -
        add_row(type = "cell_quality_filtering", nfeatures = nrow(sce), ncells = ncol(sce))
148 -
      cat("Cell filter: features - ", nrow(sce), " Cells - ", ncol(sce), sep = "")
149 -
    }
150 -
  }
151 -
152 -
  ########################################
153 -
  # Filter features
154 -
  ########################################
155 -
156 -
  if (filter_features) {
157 -
    ave_counts <- Matrix::rowMeans(BiocGenerics::counts(sce))
158 -
    keep <- ave_counts >= min_ave_expression
159 -
160 -
    sce <- sce[keep,]
161 -
162 -
    if (verbose) {
163 -
      normalisation_plots$ave_counts <- tibble(ave_counts = ave_counts) %>%
164 -
        ggplot() +
165 -
        geom_histogram(aes(ave_counts)) +
166 -
        scale_x_log10() +
167 -
        geom_vline(xintercept = min_ave_expression)
168 -
169 -
      top_features <- ave_counts %>% sort() %>% tail(20) %>% names()
170 -
      counts_top_features <-
171 -
        SingleCellExperiment::counts(sce[top_features]) %>%
172 -
        reshape2::melt(varnames = c("feature", "cell"), value.name = "count") %>%
173 -
        dplyr::mutate(feature = factor(feature, levels = top_features))
174 -
175 -
      avecounts_top_features <- counts_top_features %>%
176 -
        group_by(feature) %>%
177 -
        summarise(count = mean(count))
178 -
179 -
      normalisation_plots$top_counts <- counts_top_features %>%
180 -
        ggplot(aes(feature, count + 1)) +
181 -
        geom_point(shape = "|") +
182 -
        geom_point(data = avecounts_top_features, color = "blue", size = 4) +
183 -
        scale_y_log10() +
184 -
        coord_flip()
185 -
186 -
      normalisation_steps <- normalisation_steps %>%
187 -
        add_row(type = "feature_expression_filtering", nfeatures = nrow(sce), ncells = ncol(sce))
188 -
      cat("feature filter: features - ", nrow(sce), " Cells - ", ncol(sce), sep = "")
189 -
    }
190 -
  }
191 -
192 -
  ########################################
193 -
  # Normalise
194 -
  ########################################
195 -
196 -
  if (normalisation == "scran_size_factors") {
197 -
    # fix for small datasets with very low number of cells
198 -
    if (ncol(sce) >= 100) {
199 -
      sizes <- c(20, 40, 60, 80)
200 -
    } else {
201 -
      sizes <- round(ncol(sce) / 5)
202 -
    }
203 -
204 -
    # In some cases, the computation of these sum factors can lead to very strange size factors, which are not at all correlated with the total counts in each cell
205 -
    # If that is the case, we for now fall back to scater::librarySizeFactors
206 -
    sce <- scran::computeSumFactors(sce, sizes = sizes, min.mean = 0.1, positive = TRUE)
207 -
    sce <- sce[, SingleCellExperiment::sizeFactors(sce) > 0] # as mentioned in the scran documentation, ensure that size factors are higher than 0
208 -
209 -
    if(has_spike) {
210 -
      sce <- scran::computeSpikeFactors(sce, type = "ERCC", general.use = FALSE)
211 -
      if(any(is.na(sizeFactors(sce, type = "ERCC")))) {
212 -
        warning("Some cells do not have any spike-ins, this will cause an error further away. Remove spike-ins.")
213 -
      }
214 -
    }
215 -
216 -
    if (cor(sce$scater_qc$all$total_counts, sizeFactors(sce)) < 0.5) {
217 -
      # warning("Low correlation between sumFactors and total_counts, falling back to scater::librarySizeFactors")
218 -
      SingleCellExperiment::sizeFactors(sce) <- scater::librarySizeFactors(sce)
219 -
    }
220 -
221 -
    sce <- scater::normalize(sce)
222 -
223 -
    if (verbose) {
224 -
      normalisation_plots$size_factors <- tibble(
225 -
        size_factor = sizeFactors(sce),
226 -
        library_size = sce$scater_qc$all$total_counts
227 -
      ) %>%
228 -
        ggplot(aes(size_factor, library_size)) +
229 -
        geom_point() +
230 -
        geom_smooth(method = "lm")
231 -
    }
232 -
  } else {
233 -
    stop("Normalisation not supported")
234 -
  }
235 -
236 -
  if (verbose) {
237 -
    normalisation_steps <- normalisation_steps %>%
238 -
      add_row(type = "normalisation", nfeatures = nrow(sce), ncells = ncol(sce))
239 -
    cat("Normalised: features - ", nrow(sce), " Cells - ", ncol(sce), sep = "")
240 -
  }
241 -
242 -
  ########################################
243 -
  # Select highly variable features
244 -
  ########################################
245 -
246 -
  if (filter_hvg) {
247 -
    var_fit <- scran::trendVar(sce, method = "spline", use.spikes = has_spike) # requires aroma.light
248 -
    var_out <- scran::decomposeVar(sce, var_fit) %>% as.data.frame()
249 -
250 -
    if (verbose) {
251 -
      if (has_spike) {
252 -
        var_out$spike <- SingleCellExperiment::isSpike(sce)
253 -
      } else {
254 -
        var_out$spike <- FALSE
255 -
      }
256 -
257 -
      normalisation_plots$variance_mean <- var_out %>%
258 -
        ggplot(aes(mean, total)) +
259 -
          geom_point(aes(color = spike)) +
260 -
          geom_smooth()
261 -
262 -
      normalisation_plots$fdr_bio <- var_out %>%
263 -
        ggplot(aes(FDR, bio)) +
264 -
          geom_point() +
265 -
          geom_hline(yintercept = hvg_bio, color = "red") +
266 -
          geom_vline(xintercept = hvg_fdr, color = "red")
267 -
    }
268 -
269 -
    var_out <- var_out[order(var_out$bio, decreasing = TRUE),]
270 -
    hvg_out <- var_out[which(var_out$FDR <= hvg_fdr & var_out$bio >= hvg_bio),]
271 -
    if(nrow(hvg_out) < min_variable_fraction * ncol(counts)) {
272 -
      n_features <- min(nrow(var_out), ceiling(min_variable_fraction * ncol(counts)))
273 -
274 -
      hvg_out <- var_out[seq(1, n_features), ]
275 -
    }
276 -
277 -
    sce <- sce[rownames(hvg_out),]
278 -
279 -
    if (verbose) {
280 -
      normalisation_steps <- normalisation_steps %>%
281 -
        add_row(type = "feature_variability_filtering", nfeatures = nrow(sce), ncells = ncol(sce))
282 -
      cat("Variable features filtered: features - ", nrow(sce), " Cells - ", ncol(sce), sep = "")
283 -
    }
284 -
  }
285 -
286 -
  expr_norm_filt <- SingleCellExperiment::logcounts(sce) %>% Matrix::t()
287 -
  counts_filt <- counts[rownames(expr_norm_filt),colnames(expr_norm_filt)]
288 -
289 -
  ########################################
290 -
  # Iterative filtering on variability
291 -
  ########################################
292 -
  repeat {
293 -
    feature_sds <- counts_filt %>% apply(2, stats::sd)
294 -
    cell_sds <- counts_filt %>% apply(1, stats::sd)
295 -
296 -
    features_filtered <- which(feature_sds > 0, useNames = TRUE)
297 -
    cells_filtered <- which(cell_sds > 0, useNames = TRUE)
298 -
    expr_norm_filt <- expr_norm_filt[cells_filtered, features_filtered]
299 -
    counts_filt <- counts_filt[cells_filtered, features_filtered]
300 -
301 -
    if((min(c(feature_sds, 1), na.rm = TRUE) > 0) && (min(c(cell_sds, 1), na.rm = TRUE) > 0)){
302 -
      break
303 -
    }
304 -
  }
305 -
306 -
  if (verbose) {
307 -
    normalisation_steps <- normalisation_steps %>%
308 -
      add_row(type = "final_filtering", nfeatures = ncol(expr_norm_filt), ncells = nrow(expr_norm_filt))
309 -
    cat("Final filtering: features - ", ncol(expr_norm_filt), " Cells - ", nrow(expr_norm_filt), sep = "")
310 -
  }
311 -
312 -
  ########################################
313 -
  # Output
314 -
  ########################################
315 -
316 -
  if(verbose) {
317 -
    type <- NULL # satisfy r cmd check
318 -
319 -
    normalisation_plots$n_retained <-
320 -
      normalisation_steps %>%
321 -
      mutate(type = factor(type, levels = rev(type))) %>%
322 -
      gather("dimension", "n", -type) %>%
323 -
      ggplot2::ggplot() +
324 -
      ggplot2::geom_bar(ggplot2::aes_string("type", "n", fill = "dimension"), position = "dodge", stat = "identity") +
325 -
      ggplot2::facet_wrap(~dimension, scales = "free_x") +
326 -
      ggplot2::coord_flip()
327 -
  } else {
328 -
    normalisation_steps <- NULL
329 -
  }
330 -
331 -
  lst(
332 -
    expression = expr_norm_filt,
333 -
    counts = counts_filt,
334 -
    normalisation_plots,
335 -
    info = lst(
336 -
      has_spike,
337 -
      has_mito,
338 -
      normalisation_steps
339 -
    )
9 +
  list(
10 +
    counts = counts,
11 +
    expression = as(log2(counts + 1), "dgCMatrix")
340 12
  )
341 13
}

Everything is accounted for!

No changes detected that need to be reviewed.
What changes does Codecov check for?
Lines, not adjusted in diff, that have changed coverage data.
Files that introduced coverage data that had none before.
Files that have missing coverage data that once were tracked.
Files Coverage
R/normalisation.R +2.91% 100.00%
Project Totals (1 files) 100.00%
Loading