Re-implement textmodel_svmlim() without dependencies
1 |
#' Latent Semantic Analysis
|
|
2 |
#'
|
|
3 |
#' Fit the Latent Semantic Analysis scaling model to a [dfm], which may be
|
|
4 |
#' weighted (for instance using [quanteda::dfm_tfidf()]).
|
|
5 |
#' @param x the [dfm] on which the model will be fit
|
|
6 |
#' @param nd the number of dimensions to be included in output
|
|
7 |
#' @param margin margin to be smoothed by the SVD
|
|
8 |
#' @author Haiyan Wang and Kohei Watanabe
|
|
9 |
#' @details [svds][RSpectra::svds] in the \pkg{RSpectra} package is applied to
|
|
10 |
#' enable the fast computation of the SVD.
|
|
11 |
#' @note The number of dimensions `nd` retained in LSA is an empirical
|
|
12 |
#' issue. While a reduction in \eqn{k} can remove much of the noise, keeping
|
|
13 |
#' too few dimensions or factors may lose important information.
|
|
14 |
#' @references
|
|
15 |
#' Rosario, B. (2000).
|
|
16 |
#' [Latent
|
|
17 |
#' Semantic Indexing: An Overview](http://www.cse.msu.edu/~cse960/Papers/LSI/LSI.pdf). *Technical report INFOSYS 240 Spring
|
|
18 |
#' Paper, University of California, Berkeley.*
|
|
19 |
#'
|
|
20 |
#' Deerwester, S., Dumais, S.T., Furnas, G.W., Landauer, T.K., &
|
|
21 |
#' Harshman, R. (1990). [Indexing
|
|
22 |
#' by Latent Semantic Analysis](https://search.proquest.com/docview/1301252034). *Journal of the American Society for
|
|
23 |
#' Information Science*, 41(6): 391.
|
|
24 |
#' @examples
|
|
25 |
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
|
|
26 |
#' # create an LSA space and return its truncated representation in the low-rank space
|
|
27 |
#' tmod <- textmodel_lsa(dfmat[1:10, ])
|
|
28 |
#' head(tmod$docs)
|
|
29 |
#'
|
|
30 |
#' # matrix in low_rank LSA space
|
|
31 |
#' tmod$matrix_low_rank[,1:5]
|
|
32 |
#'
|
|
33 |
#' # fold queries into the space generated by dfmat[1:10,]
|
|
34 |
#' # and return its truncated versions of its representation in the new low-rank space
|
|
35 |
#' pred <- predict(tmod, newdata = dfmat[11:14, ])
|
|
36 |
#' pred$docs_newspace
|
|
37 |
#'
|
|
38 |
#' @keywords textmodel experimental
|
|
39 |
#' @seealso [predict.textmodel_lsa()], [coef.textmodel_lsa()]
|
|
40 |
#' @importFrom quanteda as.dfm
|
|
41 |
#' @export
|
|
42 |
textmodel_lsa <- function(x, nd = 10, margin = c("both", "documents", "features")) { |
|
43 | 1 |
UseMethod("textmodel_lsa") |
44 |
}
|
|
45 |
|
|
46 |
#' @export
|
|
47 |
textmodel_lsa.dfm <- function(x, nd = 10, margin = c("both", "documents", "features")) { |
|
48 |
|
|
49 | 1 |
x <- as.dfm(x) |
50 | 1 |
if (!sum(x)) stop(message_error("dfm_empty")) |
51 | 1 |
margin <- match.arg(margin) |
52 |
|
|
53 |
if (nd > min(nrow(x), ncol(x))) nd <- min(nrow(x), ncol(x)) |
|
54 |
if (nd < 2) nd <- 2 |
|
55 |
|
|
56 | 1 |
x <- as(x, "dgCMatrix") |
57 | 1 |
if (margin == "documents") { |
58 | 1 |
dec <- RSpectra::svds(x, k = nd, nu = 0, nv = nd) |
59 | 1 |
} else if (margin == "features") { |
60 | 1 |
dec <- RSpectra::svds(x, k = nd, nu = nd, nv = 0) |
61 |
} else { |
|
62 | 1 |
dec <- RSpectra::svds(x, nd) |
63 |
}
|
|
64 |
|
|
65 | 1 |
if (any(dec$d <= sqrt(.Machine$double.eps))) |
66 |
warning("[lsa] - there are singular values which are zero") |
|
67 |
|
|
68 | 1 |
result <- list(sk = dec$d, docs = NULL, features = NULL) |
69 |
|
|
70 | 1 |
if (margin == "documents") { |
71 | 1 |
result$features <- dec$v |
72 | 1 |
rownames(result$features) <- colnames(x) |
73 | 1 |
result$matrix_low_rank <- t(dec$v * dec$d) |
74 | 1 |
rownames(result$matrix_low_rank) <- |
75 | 1 |
paste0(quanteda_options("base_compname"), seq_len(nrow(result$matrix_low_rank))) |
76 | 1 |
colnames(result$matrix_low_rank) <- colnames(x) |
77 | 1 |
} else if (margin == "features") { |
78 | 1 |
result$docs <- dec$u |
79 | 1 |
rownames(result$docs) <- rownames(x) |
80 | 1 |
result$matrix_low_rank <- dec$u * dec$d |
81 | 1 |
rownames(result$matrix_low_rank) <- rownames(x) |
82 | 1 |
colnames(result$matrix_low_rank) <- |
83 | 1 |
paste0(quanteda_options("base_compname"), seq_len(ncol(result$matrix_low_rank))) |
84 |
} else { |
|
85 | 1 |
result$docs <- dec$u |
86 | 1 |
result$features <- dec$v |
87 | 1 |
rownames(result$docs) <- rownames(x) |
88 | 1 |
rownames(result$features) <- colnames(x) |
89 | 1 |
result$matrix_low_rank <- dec$u %*% diag(dec$d) %*% t(dec$v) |
90 | 1 |
rownames(result$matrix_low_rank) <- rownames(x) |
91 | 1 |
colnames(result$matrix_low_rank) <- colnames(x) |
92 |
}
|
|
93 |
|
|
94 |
# keep the input matrix
|
|
95 | 1 |
result$data <- x |
96 | 1 |
class(result) <- c("textmodel_lsa") |
97 |
|
|
98 |
# return the LSA space
|
|
99 | 1 |
return(result) |
100 |
}
|
|
101 |
|
|
102 |
# Post-estimation methods ---------------
|
|
103 |
|
|
104 |
#' Post-estimations methods for textmodel_lsa
|
|
105 |
#'
|
|
106 |
#' Post-estimation methods for fitted [textmodel_lsa] objects.
|
|
107 |
#' @name textmodel_lsa-postestimation
|
|
108 |
#' @param object,x previously fitted [textmodel_lsa] object
|
|
109 |
#' @param newdata new matrix to be transformed into the lsa space
|
|
110 |
#' @param ... unused
|
|
111 |
#' @return `predict()` returns a predicted [textmodel_lsa] object, projecting the patterns onto
|
|
112 |
#' new data.
|
|
113 |
#' @importFrom stats predict
|
|
114 |
#' @method predict textmodel_lsa
|
|
115 |
#' @keywords textmodel internal
|
|
116 |
#' @export
|
|
117 |
predict.textmodel_lsa <- function(object, newdata = NULL, ...) { |
|
118 |
|
|
119 | 1 |
call <- match.call() |
120 |
if (is.null(newdata)) newdata <- object$data |
|
121 |
|
|
122 | 1 |
tsa <- newdata %*% object$features %*% solve(diag(object$sk)) |
123 | 1 |
transfed <- t(object$features %*% diag(object$sk) %*% t(tsa)) |
124 |
|
|
125 | 1 |
colnames(transfed) <- rownames(object$features) |
126 | 1 |
rownames(transfed) <- rownames(newdata) |
127 |
|
|
128 | 1 |
result <- list(docs_newspace = tsa, |
129 | 1 |
matrix_low_rank = transfed) |
130 | 1 |
rownames(result$docs_newspace) <- rownames(newdata) |
131 | 1 |
class(result) <- "textmodel_lsa_predicted" |
132 | 1 |
return (result) |
133 |
}
|
|
134 |
|
|
135 |
#' @rdname textmodel_lsa-postestimation
|
|
136 |
#' @method as.dfm textmodel_lsa
|
|
137 |
#' @export
|
|
138 |
as.dfm.textmodel_lsa <- function(x) { |
|
139 | 1 |
as.dfm(x$matrix_low_rank) |
140 |
}
|
|
141 |
|
|
142 |
#' @rdname textmodel_lsa-postestimation
|
|
143 |
#' @return `coef.textmodel_lsa` extracts model coefficients from a fitted
|
|
144 |
#' [textmodel_ca] object.
|
|
145 |
#' @param doc_dim,feat_dim the document and feature dimension scores to be
|
|
146 |
#' extracted
|
|
147 |
#' @export
|
|
148 |
coef.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) { |
|
149 |
list(coef_feature = object$features[, feat_dim], |
|
150 |
coef_feature_se = rep(NA, dim(object$features)[1]), |
|
151 |
coef_document = object$docs[, doc_dim], |
|
152 |
coef_document_se = rep(NA, dim(object$docs)[1])) |
|
153 |
}
|
|
154 |
|
|
155 |
#' @rdname textmodel_lsa-postestimation
|
|
156 |
#' @export
|
|
157 |
coefficients.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) { |
|
158 |
UseMethod('coef', ...) |
|
159 |
}
|
Read our documentation on viewing source code .