Re-implement textmodel_svmlim() without dependencies
1 |
# main methods --------------
|
|
2 |
|
|
3 |
#' Wordfish text model
|
|
4 |
#'
|
|
5 |
#' Estimate Slapin and Proksch's (2008) "wordfish" Poisson scaling model of
|
|
6 |
#' one-dimensional document positions using conditional maximum likelihood.
|
|
7 |
#' @importFrom Rcpp evalCpp
|
|
8 |
#' @param x the dfm on which the model will be fit
|
|
9 |
#' @param dir set global identification by specifying the indexes for a pair of
|
|
10 |
#' documents such that \eqn{\hat{\theta}_{dir[1]} < \hat{\theta}_{dir[2]}}.
|
|
11 |
#' @param priors prior precisions for the estimated parameters \eqn{\alpha_i},
|
|
12 |
#' \eqn{\psi_j}, \eqn{\beta_j}, and \eqn{\theta_i}, where \eqn{i} indexes
|
|
13 |
#' documents and \eqn{j} indexes features
|
|
14 |
#' @param tol tolerances for convergence. The first value is a convergence
|
|
15 |
#' threshold for the log-posterior of the model, the second value is the
|
|
16 |
#' tolerance in the difference in parameter values from the iterative
|
|
17 |
#' conditional maximum likelihood (from conditionally estimating
|
|
18 |
#' document-level, then feature-level parameters).
|
|
19 |
#' @param dispersion sets whether a quasi-Poisson quasi-likelihood should be
|
|
20 |
#' used based on a single dispersion parameter (`"poisson"`), or
|
|
21 |
#' quasi-Poisson (`"quasipoisson"`)
|
|
22 |
#' @param dispersion_level sets the unit level for the dispersion parameter,
|
|
23 |
#' options are `"feature"` for term-level variances, or `"overall"`
|
|
24 |
#' for a single dispersion parameter
|
|
25 |
#' @param dispersion_floor constraint for the minimal underdispersion multiplier
|
|
26 |
#' in the quasi-Poisson model. Used to minimize the distorting effect of
|
|
27 |
#' terms with rare term or document frequencies that appear to be severely
|
|
28 |
#' underdispersed. Default is 0, but this only applies if `dispersion =
|
|
29 |
#' "quasipoisson"`.
|
|
30 |
#' @param sparse specifies whether the `"dfm"` is coerced to dense. While
|
|
31 |
#' setting this to `TRUE` will make it possible to handle larger dfm
|
|
32 |
#' objects (and make execution faster), it will generate slightly different
|
|
33 |
#' results each time, because the sparse SVD routine has a stochastic element.
|
|
34 |
#' @param abs_err specifies how the convergence is considered
|
|
35 |
#' @param svd_sparse uses svd to initialize the starting values of theta,
|
|
36 |
#' only applies when `sparse = TRUE`
|
|
37 |
#' @param residual_floor specifies the threshold for residual matrix when
|
|
38 |
#' calculating the svds, only applies when `sparse = TRUE`
|
|
39 |
#' @return An object of class `textmodel_fitted_wordfish`. This is a list
|
|
40 |
#' containing: \item{dir}{global identification of the dimension}
|
|
41 |
#' \item{theta}{estimated document positions} \item{alpha}{estimated document
|
|
42 |
#' fixed effects} \item{beta}{estimated feature marginal effects}
|
|
43 |
#' \item{psi}{estimated word fixed effects} \item{docs}{document labels}
|
|
44 |
#' \item{features}{feature labels} \item{sigma}{regularization parameter for
|
|
45 |
#' betas in Poisson form} \item{ll}{log likelihood at convergence}
|
|
46 |
#' \item{se.theta}{standard errors for theta-hats} \item{x}{dfm to which
|
|
47 |
#' the model was fit}
|
|
48 |
#' @details The returns match those of Will Lowe's R implementation of
|
|
49 |
#' `wordfish` (see the austin package), except that here we have renamed
|
|
50 |
#' `words` to be `features`. (This return list may change.) We
|
|
51 |
#' have also followed the practice begun with Slapin and Proksch's early
|
|
52 |
#' implementation of the model that used a regularization parameter of
|
|
53 |
#' se\eqn{(\sigma) = 3}, through the third element in `priors`.
|
|
54 |
#'
|
|
55 |
#' @note In the rare situation where a warning message of "The algorithm did not
|
|
56 |
#' converge." shows up, removing some documents may work.
|
|
57 |
#' @seealso [predict.textmodel_wordfish()]
|
|
58 |
#' @references Slapin, J. & Proksch, S.O. (2008).
|
|
59 |
#' [A Scaling Model
|
|
60 |
#' for Estimating Time-Series Party Positions from Texts](https://doi.org/10.1111/j.1540-5907.2008.00338.x). *American
|
|
61 |
#' Journal of Political Science*, 52(3), 705--772.
|
|
62 |
#'
|
|
63 |
#' Lowe, W. & Benoit, K.R. (2013). [Validating
|
|
64 |
#' Estimates of Latent Traits from Textual Data Using Human Judgment as a Benchmark](http://doi.org/10.1093/pan/mpt002).
|
|
65 |
#' *Political Analysis*, 21(3), 298--313.
|
|
66 |
#' @author Benjamin Lauderdale, Haiyan Wang, and Kenneth Benoit
|
|
67 |
#' @examples
|
|
68 |
#' (tmod1 <- textmodel_wordfish(data_dfm_lbgexample, dir = c(1,5)))
|
|
69 |
#' summary(tmod1, n = 10)
|
|
70 |
#' coef(tmod1)
|
|
71 |
#' predict(tmod1)
|
|
72 |
#' predict(tmod1, se.fit = TRUE)
|
|
73 |
#' predict(tmod1, interval = "confidence")
|
|
74 |
#'
|
|
75 |
#' \dontrun{
|
|
76 |
#' library("quanteda")
|
|
77 |
#' dfmat <- dfm(data_corpus_irishbudget2010)
|
|
78 |
#' (tmod2 <- textmodel_wordfish(dfmat, dir = c(6,5)))
|
|
79 |
#' (tmod3 <- textmodel_wordfish(dfmat, dir = c(6,5),
|
|
80 |
#' dispersion = "quasipoisson", dispersion_floor = 0))
|
|
81 |
#' (tmod4 <- textmodel_wordfish(dfmat, dir = c(6,5),
|
|
82 |
#' dispersion = "quasipoisson", dispersion_floor = .5))
|
|
83 |
#' plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
|
|
84 |
#' xlim = c(0, 1.0), ylim = c(0, 1.0))
|
|
85 |
#' plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
|
|
86 |
#' xlim = c(0, 1.0), ylim = c(0, 1.0), type = "n")
|
|
87 |
#' underdispersedTerms <- sample(which(tmod3$phi < 1.0), 5)
|
|
88 |
#' which(featnames(dfmat) %in% names(topfeatures(dfmat, 20)))
|
|
89 |
#' text(tmod3$phi, tmod4$phi, tmod3$features,
|
|
90 |
#' cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "grey90")
|
|
91 |
#' text(tmod3$phi['underdispersedTerms'], tmod4$phi['underdispersedTerms'],
|
|
92 |
#' tmod3$features['underdispersedTerms'],
|
|
93 |
#' cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "black")
|
|
94 |
#' if (requireNamespace("austin")) {
|
|
95 |
#' tmod5 <- austin::wordfish(quanteda::as.wfm(dfmat), dir = c(6, 5))
|
|
96 |
#' cor(tmod1$theta, tmod5$theta)
|
|
97 |
#' }}
|
|
98 |
#' @importFrom quanteda as.dfm
|
|
99 |
#' @export
|
|
100 |
textmodel_wordfish <- function(x, dir = c(1, 2), |
|
101 |
priors = c(Inf, Inf, 3, 1), |
|
102 |
tol = c(1e-6, 1e-8), |
|
103 |
dispersion = c("poisson", "quasipoisson"), |
|
104 |
dispersion_level = c("feature", "overall"), |
|
105 |
dispersion_floor = 0, |
|
106 |
sparse = FALSE, |
|
107 |
abs_err = FALSE, |
|
108 |
svd_sparse = TRUE, |
|
109 |
residual_floor = 0.5) { |
|
110 | 1 |
UseMethod("textmodel_wordfish") |
111 |
}
|
|
112 |
|
|
113 |
#' @export
|
|
114 |
textmodel_wordfish.default <- function(x, dir = c(1, 2), |
|
115 |
priors = c(Inf, Inf, 3, 1), |
|
116 |
tol = c(1e-6, 1e-8), |
|
117 |
dispersion = c("poisson", "quasipoisson"), |
|
118 |
dispersion_level = c("feature", "overall"), |
|
119 |
dispersion_floor = 0, |
|
120 |
sparse = FALSE, |
|
121 |
abs_err = FALSE, |
|
122 |
svd_sparse = TRUE, |
|
123 |
residual_floor = 0.5) { |
|
124 | 1 |
stop(friendly_class_undefined_message(class(x), "textmodel_wordfish")) |
125 |
}
|
|
126 |
|
|
127 |
#' @export
|
|
128 |
textmodel_wordfish.dfm <- function(x, dir = c(1, 2), |
|
129 |
priors = c(Inf, Inf, 3, 1), |
|
130 |
tol = c(1e-6, 1e-8), |
|
131 |
dispersion = c("poisson", "quasipoisson"), |
|
132 |
dispersion_level = c("feature", "overall"), |
|
133 |
dispersion_floor = 0, |
|
134 |
sparse = FALSE, |
|
135 |
abs_err = FALSE, |
|
136 |
svd_sparse = TRUE, |
|
137 |
residual_floor = 0.5) { |
|
138 |
|
|
139 | 1 |
x <- as.dfm(x) |
140 | 1 |
if (!sum(x)) stop(message_error("dfm_empty")) |
141 | 1 |
dispersion <- match.arg(dispersion) |
142 | 1 |
dispersion_level <- match.arg(dispersion_level) |
143 |
|
|
144 |
# check that no rows or columns are all zero
|
|
145 | 1 |
empty_docs <- which(ntoken(x) == 0) |
146 | 1 |
if (length(empty_docs)) { |
147 |
catm("Note: removed the following zero-token documents:", |
|
148 |
docnames(x)[empty_docs], "\n") |
|
149 |
x <- x[empty_docs * -1, ] |
|
150 |
}
|
|
151 | 1 |
empty_feats <- which(docfreq(x) == 0) |
152 | 1 |
if (length(empty_feats)) { |
153 |
catm("Note: removed the following zero-count features:", |
|
154 |
featnames(x)[empty_feats], "\n") |
|
155 |
x <- x[, empty_feats * -1] |
|
156 |
}
|
|
157 |
if (length(empty_docs) || length(empty_feats)) catm("\n") |
|
158 |
|
|
159 |
# some error checking
|
|
160 | 1 |
if (length(priors) != 4) |
161 |
stop("priors requires 4 elements") |
|
162 | 1 |
if (length(tol) != 2) |
163 |
stop("tol requires 2 elements") |
|
164 | 1 |
if (!is.numeric(priors) || !is.numeric(tol)) |
165 |
stop("priors and tol must be numeric") |
|
166 | 1 |
if (dispersion_floor < 0 || dispersion_floor > 1.0) |
167 |
stop("dispersion_floor must be between 0 and 1.0") |
|
168 |
|
|
169 | 1 |
if (dispersion == "poisson" && dispersion_floor != 0) |
170 |
warning("dispersion_floor argument ignored for poisson") |
|
171 |
|
|
172 |
# check quasi-poisson settings and translate into numerical values
|
|
173 |
# 1 = Poisson,
|
|
174 |
# 2 = quasi-Poisson, overall dispersion,
|
|
175 |
# 3 = quasi-Poisson, term dispersion,
|
|
176 |
# 4 = quasi-Poisson, term dispersion w/floor
|
|
177 | 1 |
if (dispersion == "poisson") { |
178 | 1 |
disp <- 1L |
179 | 1 |
} else if (dispersion == "quasipoisson" && dispersion_level == "overall") { |
180 | 1 |
disp <- 2L |
181 | 1 |
} else if (dispersion == "quasipoisson" && dispersion_level == "feature") { |
182 | 1 |
if (dispersion_floor) { |
183 |
disp <- 4L |
|
184 |
} else { |
|
185 | 1 |
disp <- 3L |
186 |
}
|
|
187 |
} else { |
|
188 |
stop("Illegal option combination.") |
|
189 |
}
|
|
190 | 1 |
if (sparse == TRUE) { |
191 | 1 |
result <- qatd_cpp_wordfish(x, as.integer(dir), 1 / (priors ^ 2), |
192 | 1 |
tol, disp, |
193 | 1 |
dispersion_floor, abs_err, svd_sparse, |
194 | 1 |
residual_floor)
|
195 |
} else{ |
|
196 | 1 |
result <- qatd_cpp_wordfish_dense(as.matrix(x), |
197 | 1 |
as.integer(dir), 1 / (priors ^ 2), |
198 | 1 |
tol, disp, |
199 | 1 |
dispersion_floor, abs_err) |
200 |
}
|
|
201 |
# NOTE: psi is a 1 x nfeat matrix, not a numeric vector
|
|
202 |
# alpha is a ndoc x 1 matrix, not a numeric vector
|
|
203 | 1 |
if (any(is.nan(result$theta))) |
204 |
warning("Warning: The algorithm did not converge.") |
|
205 |
|
|
206 | 1 |
result <- list( |
207 | 1 |
x = x, |
208 | 1 |
docs = docnames(x), |
209 | 1 |
features = featnames(x), |
210 | 1 |
dir = dir, |
211 | 1 |
dispersion = dispersion, |
212 | 1 |
priors = priors, |
213 | 1 |
theta = as.numeric(result$theta), |
214 | 1 |
beta = as.numeric(result$beta), |
215 | 1 |
psi = as.numeric(result$psi), |
216 | 1 |
alpha = as.numeric(result$alpha), |
217 | 1 |
phi = as.numeric(result$phi), |
218 | 1 |
se.theta = as.numeric(result$thetaSE) , |
219 | 1 |
call = match.call() |
220 |
)
|
|
221 | 1 |
class(result) <- c("textmodel_wordfish", "textmodel", "list") |
222 | 1 |
result |
223 |
}
|
|
224 |
|
|
225 |
#' Prediction from a textmodel_wordfish method
|
|
226 |
#'
|
|
227 |
#' `predict.textmodel_wordfish()` returns estimated document scores and
|
|
228 |
#' confidence intervals. The method is provided for consistency with other
|
|
229 |
#' `textmodel_*()` methods, but does not currently allow prediction on
|
|
230 |
#' out-of-sample data.
|
|
231 |
#' @param object a fitted wordfish model
|
|
232 |
#' @inheritParams predict.textmodel_wordscores
|
|
233 |
#' @importFrom stats predict
|
|
234 |
#' @method predict textmodel_wordfish
|
|
235 |
#' @keywords textmodel internal
|
|
236 |
#' @export
|
|
237 |
predict.textmodel_wordfish <- function(object, |
|
238 |
se.fit = FALSE, |
|
239 |
interval = c("none", "confidence"), level = 0.95, |
|
240 |
...) { |
|
241 |
if (length(list(...)) > 0) stop("Arguments:", names(list(...)), "not supported.\n") |
|
242 | 1 |
interval <- match.arg(interval) |
243 |
|
|
244 | 1 |
fit <- object$theta |
245 | 1 |
names(fit) <- object$docs |
246 |
|
|
247 | 1 |
if (!se.fit && interval == "none") { |
248 | 1 |
class(fit) <- c("predict.textmodel_wordfish", "numeric") |
249 | 1 |
return(fit) |
250 |
}
|
|
251 |
|
|
252 | 1 |
result <- list(fit = fit) |
253 | 1 |
if (se.fit) result$se.fit <- object$se.theta |
254 | 1 |
if (interval == "confidence") { |
255 | 1 |
result$fit <- matrix(result$fit, ncol = 3, nrow = length(result$fit), |
256 | 1 |
dimnames = list(names(result$fit), c("fit", "lwr", "upr"))) |
257 | 1 |
z <- stats::qnorm(1 - (1 - level) / 2) |
258 | 1 |
result$fit[, "lwr"] <- fit - z * object$se.theta |
259 | 1 |
result$fit[, "upr"] <- fit + z * object$se.theta |
260 |
}
|
|
261 | 1 |
class(result) <- c("predict.textmodel_wordscores", class(result)) |
262 | 1 |
result |
263 |
}
|
|
264 |
|
|
265 |
#' print method for a wordfish model
|
|
266 |
#' @param x for print method, the object to be printed
|
|
267 |
#' @param ... unused
|
|
268 |
#' @method print textmodel_wordfish
|
|
269 |
#' @keywords internal textmodel
|
|
270 |
#' @export
|
|
271 |
print.textmodel_wordfish <- function(x, ...) { |
|
272 | 1 |
cat("\nCall:\n") |
273 | 1 |
print(x$call) |
274 | 1 |
cat("\n", |
275 | 1 |
"Dispersion: ", x$dispersion, "; ", |
276 | 1 |
"direction: ", x$dir[1], ' < ' , x$dir[2], "; ", |
277 | 1 |
ndoc(x), " documents; ", |
278 | 1 |
nfeat(x), " features.", |
279 | 1 |
"\n", |
280 | 1 |
sep = "") |
281 |
}
|
|
282 |
|
|
283 |
#' summary method for textmodel_wordfish
|
|
284 |
#' @param object a [textmodel_wordfish] object
|
|
285 |
#' @param n maximum number of features to print in summary
|
|
286 |
#' @param ... unused
|
|
287 |
#' @export
|
|
288 |
#' @method summary textmodel_wordfish
|
|
289 |
#' @keywords internal textmodel
|
|
290 |
summary.textmodel_wordfish <- function(object, n = 30, ...) { |
|
291 |
|
|
292 | 1 |
stat <- data.frame( |
293 | 1 |
theta = object$theta, |
294 | 1 |
se = object$se.theta, |
295 | 1 |
row.names = object$docs, |
296 | 1 |
check.rows = FALSE, |
297 | 1 |
stringsAsFactors = FALSE |
298 |
)
|
|
299 |
|
|
300 | 1 |
result <- list( |
301 | 1 |
'call' = object$call, |
302 | 1 |
'estimated.document.positions' = as.statistics_textmodel(stat), |
303 | 1 |
'estimated.feature.scores' = as.coefficients_textmodel(head(coef(object)$features, n)) |
304 |
)
|
|
305 | 1 |
return(as.summary.textmodel(result)) |
306 |
}
|
|
307 |
|
|
308 |
#' @rdname predict.textmodel_wordfish
|
|
309 |
#' @param margin which margin of parameter estimates to return: both (in a
|
|
310 |
#' list), or just document or feature parameters
|
|
311 |
#' @method coef textmodel_wordfish
|
|
312 |
#' @return `coef.textmodel_wordfish()` returns a matrix of estimated
|
|
313 |
#' parameters coefficients for the specified margin.
|
|
314 |
#' @export
|
|
315 |
coef.textmodel_wordfish <- function(object, margin = c("both", "documents", "features"), ...) { |
|
316 | 1 |
margin <- match.arg(margin) |
317 | 1 |
result <- list( |
318 | 1 |
documents = matrix(cbind(object$theta, object$alpha), ncol = 2, |
319 | 1 |
dimnames = list(docnames(object), c("theta", "alpha"))), |
320 | 1 |
features = matrix(cbind(object$beta, object$psi), ncol = 2, |
321 | 1 |
dimnames = list(featnames(object), c("beta", "psi"))) |
322 |
)
|
|
323 | 1 |
if (margin == "documents") { |
324 | 1 |
result[["documents"]] |
325 | 1 |
} else if (margin == "features") { |
326 | 1 |
result[["features"]] |
327 | 1 |
} else result |
328 |
}
|
|
329 |
|
|
330 |
#' @export
|
|
331 |
#' @rdname predict.textmodel_wordfish
|
|
332 |
coefficients.textmodel_wordfish <- function(object, ...) { |
|
333 |
UseMethod("coef") |
|
334 |
}
|
|
335 |
|
|
336 |
#' @export
|
|
337 |
#' @method print predict.textmodel_wordfish
|
|
338 |
print.predict.textmodel_wordfish <- function(x, ...) { |
|
339 |
print(unclass(x)) |
|
340 |
}
|
Read our documentation on viewing source code .