TommyJones / mvrsquared

Compare 76ba5f5 ... +18 ... 6c91020

Coverage Reach

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then setup custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.


@@ -1,47 +1,68 @@
Loading
1 -
1 +
// [[Rcpp::plugins(cpp11)]]
2 +
// [[Rcpp::depends(RcppThread)]]
2 3
// [[Rcpp::depends(RcppArmadillo)]]
4 +
#include "RcppThread.h"
3 5
#include <RcppArmadillo.h>
4 6
#include <cmath>
5 7
#define ARMA_64BIT_WORD
6 8
using namespace Rcpp ;
7 9
8 10
// [[Rcpp::export]]
9 -
NumericVector calc_sum_squares_latent(arma::sp_mat Y, NumericMatrix X, NumericMatrix W, NumericVector ybar) {
11 +
NumericVector calc_sum_squares_latent(
12 +
    arma::sp_mat Y,
13 +
    arma::mat X,
14 +
    arma::mat W,
15 +
    arma::vec ybar,
16 +
    int threads
17 +
) {
18 +
19 +
  Y = Y.t(); // transpose Y to take advantage of column major for parallelism
10 20
11 -
  int n_obs = Y.n_rows; // number of observations
12 -
  int n_latent_vars = X.ncol(); // number of latent variables
13 -
  int n_explicit_vars = W.ncol(); // number of explicit variables
21 +
  int n_obs = Y.n_cols; // number of observations
14 22
  NumericVector result(2); // final result
15 23
  double SSE = 0; // sum of squared errors across all documents
16 24
  double SST = 0; // total sum of squares across all documents
17 25
18 26
27 +
  // convert R equivalent classes to arma classes
28 +
29 +
19 30
  // for each observations...
20 -
  for(int d = 0; d < n_obs; d++){
31 +
  RcppThread::parallelFor(
32 +
    0,
33 +
    n_obs,
34 +
    [&Y,
35 +
     &X,
36 +
     &W,
37 +
     &ybar,
38 +
     &SSE,
39 +
     &SST
40 +
    ] (unsigned int d){
41 +
      RcppThread::checkUserInterrupt();
21 42
22 -
    R_CheckUserInterrupt();
43 +
      // Yhat = X %*% W. But doing it funny below to optimize calculation
44 +
      double sse = 0;
45 +
      double sst = 0;
23 46
24 -
    // Yhat = X %*% W. But doing it funny below to optimize calculation
25 -
    double sse = 0;
26 -
    double sst = 0;
47 +
      for(int v = 0; v < W.n_cols; v++ ){
48 +
        double Yhat = 0;
27 49
28 -
    for(int v = 0; v < n_explicit_vars; v++ ){
29 -
      double Yhat = 0;
50 +
        for(int k = 0; k < X.n_cols; k++ ){
51 +
          Yhat = Yhat + X(d , k ) * W(k , v );
52 +
        }
30 53
31 -
      for(int k = 0; k < n_latent_vars; k++ ){
32 -
        Yhat = Yhat + X(d , k ) * W(k , v );
33 -
      }
54 +
        sse = sse + ((Y(v, d) - Yhat) * (Y(v, d) - Yhat));
34 55
35 -
      sse = sse + ((Y(d , v) - Yhat) * (Y(d , v) - Yhat));
56 +
        sst = sst + ((Y(v, d) - ybar[ v ]) * (Y(v, d) - ybar[ v ]));
36 57
37 -
      sst = sst + ((Y(d , v) - ybar[ v ]) * (Y(d , v) - ybar[ v ]));
58 +
      }
38 59
39 -
    }
60 +
      SSE = SSE + sse;
40 61
41 -
    SSE = SSE + sse;
62 +
      SST = SST + sst;
63 +
    },
64 +
    threads);
42 65
43 -
    SST = SST + sst;
44 -
  }
45 66
46 67
  result[ 0 ] = SSE;
47 68
  result[ 1 ] = SST;

@@ -12,6 +12,7 @@
Loading
12 12
#'             computation in batches.
13 13
#' @param return_ss_only Logical. Do you want to forego calculating R-squared and
14 14
#'                       only return the sums of squares?
15 +
#' @param threads Integer number of threads for parallelism; defaults to 1.
15 16
#' @return
16 17
#'     If \code{return_ss_only = FALSE}, \code{calc_rsquared} returns a numeric
17 18
#'     scalar R-squared. If \code{return_ss_only = TRUE}, \code{calc_rsquared}
@@ -33,6 +34,11 @@
Loading
33 34
#'     that \code{yhat[[1]]} left multiplies \code{yhat[[2]]}.
34 35
#'
35 36
#' @note
37 +
#'     On some Linux systems, setting \code{threads} greater than 1 for parallelism
38 +
#'     may introduce some imprecision in the calculation. As of this writing, the
39 +
#'     cause is still under investigation. In the meantime, setting \code{threads = 1}
40 +
#'     should fix the issue.
41 +
#'
36 42
#'     Setting \code{return_ss_only} to \code{TRUE} is useful for parallel or
37 43
#'     distributed computing for large data sets, particularly when \code{y} is
38 44
#'     a large matrix. However if you do parallel execution you MUST pre-calculate
@@ -67,34 +73,14 @@
Loading
67 73
#'
68 74
#' # multivariate r-squared with yhat as a linear reconstruction of two matrices
69 75
#' calc_rsquared(y = cbind(y, y), yhat = list(x, cbind(w,w)))
70 -
#'
71 -
#' # multivariate r-squared calculated in parallel
72 -
#' \donttest{
73 -
#' library(furrr)
74 -
#'
75 -
#' plan(multiprocess)
76 -
#'
77 -
#' batches <- list(list(y = cbind(y[1:16], y[1:16]), yhat = cbind(yhat[1:16], yhat[1:16])),
78 -
#'                 list(y = cbind(y[17:32], y[17:32]), yhat = cbind(yhat[17:32], yhat[17:32])))
79 -
#'
80 -
#' ybar <- c(mean(y), mean(y))
81 -
#'
82 -
#' ss <- future_map(.x = batches,
83 -
#'                  .f = function(ybatch){
84 -
#'                    calc_rsquared(y = ybatch$y, yhat = ybatch$yhat,
85 -
#'                                  ybar = ybar, return_ss_only = TRUE)
86 -
#'                  },
87 -
#'                  .options = future_options(scheduling = 2))
88 -
#'
89 -
#'
90 -
#' sse <- sum(sapply(ss, function(x) x["sse"]))
91 -
#'
92 -
#' sst <- sum(sapply(ss, function(x) x["sst"]))
93 -
#'
94 -
#' 1 - sse / sst # final r-squared value here
95 -
#'}
96 76
#' @export
97 -
calc_rsquared <- function(y, yhat, ybar = NULL, return_ss_only = FALSE) {
77 +
calc_rsquared <- function(
78 +
  y,
79 +
  yhat,
80 +
  ybar = NULL,
81 +
  return_ss_only = FALSE,
82 +
  threads = 1
83 +
) {
98 84
99 85
  ### Preliminary input checking ----
100 86
  if (! is.logical(return_ss_only) || is.na(return_ss_only))
@@ -121,10 +107,13 @@
Loading
121 107
  if (is.null(ybar))
122 108
    ybar <- Matrix::colMeans(Y)
123 109
124 -
  result <- calc_sum_squares_latent(Y = Y,
125 -
                                    X = Yhat[[1]],
126 -
                                    W = Yhat[[2]],
127 -
                                    ybar = ybar)
110 +
  result <- calc_sum_squares_latent(
111 +
    Y = Y,
112 +
    X = Yhat[[1]],
113 +
    W = Yhat[[2]],
114 +
    ybar = ybar,
115 +
    threads = threads
116 +
  )
128 117
129 118
  if (return_ss_only) {
130 119

Everything is accounted for!

No changes detected that need to be reviewed.
What changes does Codecov check for?
Lines, not adjusted in diff, that have changed coverage data.
Files that introduced coverage data that had none before.
Files that have missing coverage data that once were tracked.

20 Commits

Hiding 1 contextual commit
Hiding 2 contextual commits
Hiding 2 contextual commits
+1
+1
Files Coverage
R/calc_rsquared.R 0.05% 97.87%
src/calc_sum_squares_latent.cpp 100.00%
Project Totals (2 files) 98.29%
Loading