1
#' Correspondence analysis of a document-feature matrix
2
#'
3
#' `textmodel_ca` implements correspondence analysis scaling on a
4
#' [dfm].  The method is a fast/sparse version of function [ca][ca::ca].
5
#' @param x the dfm on which the model will be fit
6
#' @param smooth a smoothing parameter for word counts; defaults to zero.
7
#' @param nd  Number of dimensions to be included in output; if `NA` (the
8
#'   default) then the maximum possible dimensions are included.
9
#' @param sparse retains the sparsity if set to `TRUE`; set it to
10
#'   `TRUE` if `x` (the [dfm]) is too big to be allocated after
11
#'   converting to dense
12
#' @param residual_floor specifies the threshold for the residual matrix for
13
#'   calculating the truncated svd.Larger value will reduce memory and time cost
14
#'   but might reduce accuracy; only applicable when `sparse = TRUE`
15

16
#' @author Kenneth Benoit and Haiyan Wang
17
#' @references Nenadic, O. & Greenacre, M. (2007). [Correspondence Analysis in R, with Two- and Three-dimensional Graphics:
18
#'   The ca package](http://www.jstatsoft.org/v20/i03/). *Journal of Statistical Software*, 20(3).
19
#'
20
#' @details [svds][RSpectra::svds] in the \pkg{RSpectra} package is applied to
21
#'   enable the fast computation of the SVD.
22
#' @note You may need to set `sparse = TRUE`) and
23
#'   increase the value of `residual_floor` to ignore less important
24
#'   information and hence to reduce the memory cost when you have a very big
25
#'   [dfm].
26
#'   If your attempt to fit the model fails due to the matrix being too large,
27
#'   this is probably because of the memory demands of computing the \eqn{V
28
#'   \times V} residual matrix.  To avoid this, consider increasing the value of
29
#'   `residual_floor` by 0.1, until the model can be fit.
30
#' @return `textmodel_ca()` returns a fitted CA textmodel that is a special
31
#' class of \pkg{ca} object.
32
#' @examples
33
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
34
#' tmod <- textmodel_ca(dfmat)
35
#' summary(tmod)
36
#' @seealso [coef.textmodel_lsa()], [ca][ca::ca]
37
#' @importFrom quanteda as.dfm
38
#' @export
39
textmodel_ca <- function(x, smooth = 0, nd = NA, sparse = FALSE,
40
                         residual_floor = 0.1) {
41 1
    UseMethod("textmodel_ca")
42
}
43

44
#' @export
45
textmodel_ca.default <- function(x, smooth = 0, nd = NA, sparse = FALSE,
46
                                 residual_floor = 0.1) {
47 1
    stop(friendly_class_undefined_message(class(x), "textmodel_ca"))
48
}
49

50
#' @export
51
textmodel_ca.dfm <- function(x, smooth = 0, nd = NA, sparse = FALSE,
52
                             residual_floor = 0.1) {
53 1
    x <- as.dfm(x)
54 1
    if (!sum(x)) stop(message_error("dfm_empty"))
55

56 1
    x <- x + smooth  # smooth by the specified amount
57

58 1
    I <- dim(x)[1]
59 1
    J <- dim(x)[2]
60 1
    rn <- dimnames(x)[[1]]
61 1
    cn <- dimnames(x)[[2]]
62

63
    # default value of rank k
64 1
    if (is.na(nd)){
65
        #nd <- max(floor(min(I, J)/4), 1)
66 1
        nd <- max(floor(3 * log(min(I, J))), 1)
67
    } else {
68 1
        nd.max <- min(dim(x)) - 1
69 0
        if (nd > nd.max ) nd <- nd.max
70
    }
71 1
    nd0 <- nd
72

73 1
    n <- sum(x)
74 1
    P <- x / n
75 1
    rm <- rowSums(P)
76 1
    cm <- colSums(P)
77

78 1
    if (sparse == FALSE){
79
        # generally fast for a not-so-large dfm
80 1
        eP <- Matrix::tcrossprod(rm, cm)
81 1
        S  <- (P - eP) / sqrt(eP)
82
    } else {
83
        # keep the residual matrix sparse
84 1
        S <- as(qatd_cpp_ca(P, residual_floor / sqrt(n)), 'dgCMatrix')
85
    }
86

87 1
    dec <- RSpectra::svds(S, nd)
88

89 1
    chimat <- S ^ 2 * n
90 1
    sv     <- dec$d[seq_len(nd)]
91 1
    u      <- dec$u
92 1
    v      <- dec$v
93 1
    ev     <- sv ^ 2
94 1
    cumev  <- cumsum(ev)
95

96
    # Inertia:
97 1
    totin <- sum(ev)
98 1
    rin <- rowSums(S ^ 2)
99 1
    cin <- colSums(S ^ 2)
100

101
    # chidist
102 1
    rachidist <- sqrt(rin / rm)
103 1
    cachidist <- sqrt(cin / cm)
104 1
    rchidist <- rachidist
105 1
    cchidist <- cachidist
106

107
    # Standard coordinates:
108 1
    phi <- as.matrix(u[,seq_len(nd)]) / sqrt(rm)
109 1
    rownames(phi) <- rn
110 1
    colnames(phi) <- paste("Dim", seq_len(ncol(phi)), sep="")
111

112 1
    gam <- as.matrix(v[,1:nd]) / sqrt(cm)
113 1
    rownames(gam) <- cn
114 1
    colnames(gam) <- paste("Dim", seq_len(ncol(gam)), sep="")
115
    # remove attributes
116 1
    attr(rm, "names") <- NULL
117 1
    attr(cm, "names") <- NULL
118 1
    attr(rchidist, "names") <- NULL
119 1
    attr(cchidist, "names") <- NULL
120 1
    attr(rin, "names") <- NULL
121 1
    attr(cin, "names") <- NULL
122

123
    #results
124 1
    ca_model <-
125 1
        list(sv         = sv,
126 1
             nd         = nd0,
127 1
             rownames   = rn,
128 1
             rowmass    = rm,
129 1
             rowdist    = rchidist,
130 1
             rowinertia = rin,
131 1
             rowcoord   = phi,
132 1
             rowsup     = logical(0),
133 1
             colnames   = cn,
134 1
             colmass    = cm,
135 1
             coldist    = cchidist,
136 1
             colinertia = cin,
137 1
             colcoord   = gam,
138 1
             colsup     = logical(0),
139 1
             call       = match.call())
140 1
    class(ca_model) <- c("textmodel_ca", "ca", "list")
141 1
    return(ca_model)
142
}
143

144
#' Extract model coefficients from a fitted textmodel_ca object
145
#'
146
#' `coef()` extract model coefficients from a fitted `textmodel_ca`
147
#' object.  `coefficients()` is an alias.
148
#' @param object a fitted [textmodel_ca] object
149
#' @param doc_dim,feat_dim the document and feature dimension scores to be
150
#'   extracted
151
#' @param ... unused
152
#' @keywords textmodel internal
153
#' @export
154
coef.textmodel_ca <- function(object, doc_dim = 1, feat_dim = 1, ...) {
155 1
    list(coef_feature = object$colcoord[, feat_dim],
156 1
         coef_feature_se = rep(NA, length(object$colnames)),
157 1
         coef_document = object$rowcoord[, doc_dim],
158 1
         coef_document_se = rep(NA, length(object$rownames)))
159
}
160

161
#' @rdname coef.textmodel_ca
162
#' @export
163
coefficients.textmodel_ca <- function(object, doc_dim = 1, feat_dim = 1, ...) {
164 0
    UseMethod('coef')
165
}

Read our documentation on viewing source code .

Loading