1
#' Latent Semantic Analysis
2
#'
3
#' Fit the Latent Semantic Analysis scaling model to a [dfm], which may be
4
#' weighted (for instance using [quanteda::dfm_tfidf()]).
5
#' @param x the [dfm] on which the model will be fit
6
#' @param nd  the number of dimensions to be included in output
7
#' @param margin margin to be smoothed by the SVD
8
#' @author Haiyan Wang and Kohei Watanabe
9
#' @details [svds][RSpectra::svds] in the \pkg{RSpectra} package is applied to
10
#'   enable the fast computation of the SVD.
11
#' @note  The number of dimensions `nd` retained in LSA is an empirical
12
#'   issue. While a reduction in \eqn{k} can remove much of the noise, keeping
13
#'   too few dimensions or factors may lose important information.
14
#' @references
15
#'   Rosario, B. (2000).
16
#'   [Latent
17
#'   Semantic Indexing: An Overview](http://www.cse.msu.edu/~cse960/Papers/LSI/LSI.pdf). *Technical report INFOSYS 240 Spring
18
#'   Paper, University of California, Berkeley.*
19
#'
20
#'   Deerwester, S., Dumais, S.T., Furnas, G.W., Landauer, T.K., &
21
#'   Harshman, R. (1990). [Indexing
22
#'   by Latent Semantic Analysis](https://search.proquest.com/docview/1301252034). *Journal of the American Society for
23
#'   Information Science*, 41(6): 391.
24
#' @examples
25
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
26
#' # create an LSA space and return its truncated representation in the low-rank space
27
#' tmod <- textmodel_lsa(dfmat[1:10, ])
28
#' head(tmod$docs)
29
#'
30
#' # matrix in low_rank LSA space
31
#' tmod$matrix_low_rank[,1:5]
32
#'
33
#' # fold queries into the space generated by dfmat[1:10,]
34
#' # and return its truncated versions of its representation in the new low-rank space
35
#' pred <- predict(tmod, newdata = dfmat[11:14, ])
36
#' pred$docs_newspace
37
#'
38
#' @keywords textmodel experimental
39
#' @seealso [predict.textmodel_lsa()], [coef.textmodel_lsa()]
40
#' @importFrom quanteda as.dfm
41
#' @export
42
textmodel_lsa <- function(x, nd = 10, margin = c("both", "documents", "features")) {
43 1
    UseMethod("textmodel_lsa")
44
}
45

46
#' @export
47
textmodel_lsa.dfm <- function(x, nd = 10, margin = c("both", "documents", "features")) {
48

49 1
    x <- as.dfm(x)
50 1
    if (!sum(x)) stop(message_error("dfm_empty"))
51 1
    margin <- match.arg(margin)
52

53 0
    if (nd > min(nrow(x), ncol(x))) nd <- min(nrow(x), ncol(x))
54 0
    if (nd < 2) nd <- 2
55

56 1
    x <- as(x, "dgCMatrix")
57 1
    if (margin == "documents") {
58 1
        dec <- RSpectra::svds(x, k = nd, nu = 0, nv = nd)
59 1
    } else if (margin == "features") {
60 1
        dec <- RSpectra::svds(x, k = nd, nu = nd, nv = 0)
61
    } else {
62 1
        dec <- RSpectra::svds(x, nd)
63
    }
64

65 1
    if (any(dec$d <= sqrt(.Machine$double.eps)))
66 0
        warning("[lsa] - there are singular values which are zero")
67

68 1
    result <- list(sk = dec$d, docs = NULL, features = NULL)
69

70 1
    if (margin == "documents") {
71 1
        result$features <- dec$v
72 1
        rownames(result$features) <- colnames(x)
73 1
        result$matrix_low_rank <- t(dec$v * dec$d)
74 1
        rownames(result$matrix_low_rank) <-
75 1
            paste0(quanteda_options("base_compname"), seq_len(nrow(result$matrix_low_rank)))
76 1
        colnames(result$matrix_low_rank) <- colnames(x)
77 1
    } else if (margin == "features") {
78 1
        result$docs <- dec$u
79 1
        rownames(result$docs) <- rownames(x)
80 1
        result$matrix_low_rank <- dec$u * dec$d
81 1
        rownames(result$matrix_low_rank) <- rownames(x)
82 1
        colnames(result$matrix_low_rank) <-
83 1
            paste0(quanteda_options("base_compname"), seq_len(ncol(result$matrix_low_rank)))
84
    } else {
85 1
        result$docs <- dec$u
86 1
        result$features <- dec$v
87 1
        rownames(result$docs) <- rownames(x)
88 1
        rownames(result$features) <- colnames(x)
89 1
        result$matrix_low_rank <- dec$u %*% diag(dec$d) %*% t(dec$v)
90 1
        rownames(result$matrix_low_rank) <- rownames(x)
91 1
        colnames(result$matrix_low_rank) <- colnames(x)
92
    }
93

94
    # keep the input matrix
95 1
    result$data <- x
96 1
    class(result) <- c("textmodel_lsa")
97

98
    # return the LSA space
99 1
    return(result)
100
}
101

102
# Post-estimation methods ---------------
103

104
#' Post-estimations methods for textmodel_lsa
105
#'
106
#' Post-estimation methods for fitted [textmodel_lsa] objects.
107
#' @name textmodel_lsa-postestimation
108
#' @param object,x previously fitted [textmodel_lsa] object
109
#' @param newdata new matrix to be transformed into the lsa space
110
#' @param ... unused
111
#' @return `predict()` returns a predicted [textmodel_lsa] object, projecting the patterns onto
112
#' new data.
113
#' @importFrom stats predict
114
#' @method predict textmodel_lsa
115
#' @keywords textmodel internal
116
#' @export
117
predict.textmodel_lsa <- function(object, newdata = NULL, ...) {
118

119 1
    call <- match.call()
120 0
    if (is.null(newdata)) newdata <- object$data
121

122 1
    tsa <-  newdata %*% object$features %*% solve(diag(object$sk))
123 1
    transfed <- t(object$features %*% diag(object$sk) %*% t(tsa))
124

125 1
    colnames(transfed) <- rownames(object$features)
126 1
    rownames(transfed) <- rownames(newdata)
127

128 1
    result <- list(docs_newspace = tsa,
129 1
                   matrix_low_rank = transfed)
130 1
    rownames(result$docs_newspace) <- rownames(newdata)
131 1
    class(result) <- "textmodel_lsa_predicted"
132 1
    return (result)
133
}
134

135
#' @rdname textmodel_lsa-postestimation
136
#' @method as.dfm textmodel_lsa
137
#' @export
138
as.dfm.textmodel_lsa <- function(x) {
139 1
    as.dfm(x$matrix_low_rank)
140
}
141

142
#' @rdname textmodel_lsa-postestimation
143
#' @return `coef.textmodel_lsa` extracts model coefficients from a fitted
144
#'   [textmodel_ca] object.
145
#' @param doc_dim,feat_dim the document and feature dimension scores to be
146
#'   extracted
147
#' @export
148
coef.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
149 0
    list(coef_feature = object$features[, feat_dim],
150 0
         coef_feature_se = rep(NA, dim(object$features)[1]),
151 0
         coef_document = object$docs[, doc_dim],
152 0
         coef_document_se = rep(NA, dim(object$docs)[1]))
153
}
154

155
#' @rdname textmodel_lsa-postestimation
156
#' @export
157
coefficients.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
158 0
    UseMethod('coef', ...)
159
}

Read our documentation on viewing source code .

Loading