TommyJones / mvrsquared

@@ -1,47 +1,63 @@
Loading
1 -
1 +
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(RcppThread)]]
// [[Rcpp::depends(RcppArmadillo)]]

// ARMA_64BIT_WORD must be defined BEFORE RcppArmadillo is included,
// otherwise the configuration macro has no effect.
#define ARMA_64BIT_WORD

#include "RcppThread.h"
#include <RcppArmadillo.h>
#include <cmath>
#include <numeric>
#include <vector>

using namespace Rcpp ;
7 9
8 10
// [[Rcpp::export]]
9 -
NumericVector calc_sum_squares_latent(arma::sp_mat Y, NumericMatrix X, NumericMatrix W, NumericVector ybar) {
10 -
11 -
  int n_obs = Y.n_rows; // number of observations
12 -
  int n_latent_vars = X.ncol(); // number of latent variables
13 -
  int n_explicit_vars = W.ncol(); // number of explicit variables
11 +
NumericVector calc_sum_squares_latent(
12 +
    arma::sp_mat Y,
13 +
    NumericMatrix X,
14 +
    NumericMatrix W,
15 +
    NumericVector ybar,
16 +
    int threads
17 +
) {
18 +
19 +
  int n_obs = Y.n_cols; // number of observations
14 20
  NumericVector result(2); // final result
15 21
  double SSE = 0; // sum of squared errors across all documents
16 22
  double SST = 0; // total sum of squares across all documents
17 23
18 24
19 25
  // for each observations...
20 -
  for(int d = 0; d < n_obs; d++){
26 +
  RcppThread::parallelFor(
27 +
    0,
28 +
    n_obs,
29 +
    [&Y,
30 +
     &X,
31 +
     &W,
32 +
     &ybar,
33 +
     &SSE,
34 +
     &SST
35 +
    ] (unsigned int d){
36 +
      RcppThread::checkUserInterrupt();
21 37
22 -
    R_CheckUserInterrupt();
38 +
      // Yhat = X %*% W. But doing it funny below to optimize calculation
39 +
      double sse = 0;
40 +
      double sst = 0;
23 41
24 -
    // Yhat = X %*% W. But doing it funny below to optimize calculation
25 -
    double sse = 0;
26 -
    double sst = 0;
42 +
      for(int v = 0; v < W.ncol(); v++ ){
43 +
        double Yhat = 0;
27 44
28 -
    for(int v = 0; v < n_explicit_vars; v++ ){
29 -
      double Yhat = 0;
45 +
        for(int k = 0; k < X.ncol(); k++ ){
46 +
          Yhat = Yhat + X(d , k ) * W(k , v );
47 +
        }
30 48
31 -
      for(int k = 0; k < n_latent_vars; k++ ){
32 -
        Yhat = Yhat + X(d , k ) * W(k , v );
33 -
      }
49 +
        sse = sse + ((Y(v, d) - Yhat) * (Y(v, d) - Yhat));
34 50
35 -
      sse = sse + ((Y(d , v) - Yhat) * (Y(d , v) - Yhat));
51 +
        sst = sst + ((Y(v, d) - ybar[ v ]) * (Y(v, d) - ybar[ v ]));
36 52
37 -
      sst = sst + ((Y(d , v) - ybar[ v ]) * (Y(d , v) - ybar[ v ]));
53 +
      }
38 54
39 -
    }
55 +
      SSE = SSE + sse;
40 56
41 -
    SSE = SSE + sse;
57 +
      SST = SST + sst;
58 +
    },
59 +
    threads);
42 60
43 -
    SST = SST + sst;
44 -
  }
45 61
46 62
  result[ 0 ] = SSE;
47 63
  result[ 1 ] = SST;

@@ -12,6 +12,7 @@
Loading
12 12
#'             computation in batches.
13 13
#' @param return_ss_only Logical. Do you want to forego calculating R-squared and
14 14
#'                       only return the sums of squares?
15 +
#' @param threads Integer number of threads for parallelism; defaults to 1.
15 16
#' @return
16 17
#'     If \code{return_ss_only = FALSE}, \code{calc_rsquared} returns a numeric
17 18
#'     scalar R-squared. If \code{return_ss_only = TRUE}, \code{calc_rsquared}
@@ -67,34 +68,14 @@
Loading
67 68
#'
68 69
#' # multivariate r-squared with yhat as a linear reconstruction of two matrices
69 70
#' calc_rsquared(y = cbind(y, y), yhat = list(x, cbind(w,w)))
70 -
#'
71 -
#' # multivariate r-squared calculated in parallel
72 -
#' \donttest{
73 -
#' library(furrr)
74 -
#'
75 -
#' plan(multiprocess)
76 -
#'
77 -
#' batches <- list(list(y = cbind(y[1:16], y[1:16]), yhat = cbind(yhat[1:16], yhat[1:16])),
78 -
#'                 list(y = cbind(y[17:32], y[17:32]), yhat = cbind(yhat[17:32], yhat[17:32])))
79 -
#'
80 -
#' ybar <- c(mean(y), mean(y))
81 -
#'
82 -
#' ss <- future_map(.x = batches,
83 -
#'                  .f = function(ybatch){
84 -
#'                    calc_rsquared(y = ybatch$y, yhat = ybatch$yhat,
85 -
#'                                  ybar = ybar, return_ss_only = TRUE)
86 -
#'                  },
87 -
#'                  .options = future_options(scheduling = 2))
88 -
#'
89 -
#'
90 -
#' sse <- sum(sapply(ss, function(x) x["sse"]))
91 -
#'
92 -
#' sst <- sum(sapply(ss, function(x) x["sst"]))
93 -
#'
94 -
#' 1 - sse / sst # final r-squared value here
95 -
#'}
96 71
#' @export
97 -
calc_rsquared <- function(y, yhat, ybar = NULL, return_ss_only = FALSE) {
72 +
calc_rsquared <- function(
73 +
  y,
74 +
  yhat,
75 +
  ybar = NULL,
76 +
  return_ss_only = FALSE,
77 +
  threads = 1
78 +
) {
98 79
99 80
  ### Preliminary input checking ----
100 81
  if (! is.logical(return_ss_only) || is.na(return_ss_only))
@@ -121,10 +102,13 @@
Loading
121 102
  if (is.null(ybar))
122 103
    ybar <- Matrix::colMeans(Y)
123 104
124 -
  result <- calc_sum_squares_latent(Y = Y,
125 -
                                    X = Yhat[[1]],
126 -
                                    W = Yhat[[2]],
127 -
                                    ybar = ybar)
105 +
  result <- calc_sum_squares_latent(
106 +
    Y = t(Y), # transpose so we can take advantage of column major form
107 +
    X = Yhat[[1]],
108 +
    W = Yhat[[2]],
109 +
    ybar = ybar,
110 +
    threads = threads
111 +
  )
128 112
129 113
  if (return_ss_only) {
130 114
Files Coverage
R/calc_rsquared.R 97.87%
src/calc_sum_squares_latent.cpp 100.00%
Project Totals (2 files) 98.28%
Sunburst
The inner-most circle is the entire project; moving away from the center are folders and then, finally, a single file. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project, proceeding to folders and finally individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading