1
# main methods --------------
2

3
#' Wordfish text model
4
#'
5
#' Estimate Slapin and Proksch's (2008) "wordfish" Poisson scaling model of
6
#' one-dimensional document positions using conditional maximum likelihood.
7
#' @importFrom Rcpp evalCpp
8
#' @param x the dfm on which the model will be fit
9
#' @param dir set global identification by specifying the indexes for a pair of
10
#'   documents such that \eqn{\hat{\theta}_{dir[1]} < \hat{\theta}_{dir[2]}}.
11
#' @param priors prior precisions for the estimated parameters \eqn{\alpha_i},
12
#'   \eqn{\psi_j}, \eqn{\beta_j}, and \eqn{\theta_i}, where \eqn{i} indexes
13
#'   documents and \eqn{j} indexes features
14
#' @param tol tolerances for convergence.  The first value is a convergence
15
#'   threshold for the log-posterior of the model, the second value is the
16
#'   tolerance in the difference in parameter values from the iterative
17
#'   conditional maximum likelihood (from conditionally estimating
18
#'   document-level, then feature-level parameters).
19
#' @param dispersion sets whether a quasi-Poisson quasi-likelihood should be
20
#'   used based on a single dispersion parameter (`"poisson"`), or
21
#'   quasi-Poisson (`"quasipoisson"`)
22
#' @param dispersion_level sets the unit level for the dispersion parameter,
23
#'   options are `"feature"` for term-level variances, or `"overall"`
24
#'   for a single dispersion parameter
25
#' @param dispersion_floor constraint for the minimal underdispersion multiplier
26
#'   in the quasi-Poisson model.  Used to minimize the distorting effect of
27
#'   terms with rare term or document frequencies that appear to be severely
28
#'   underdispersed.  Default is 0, but this only applies if `dispersion =
29
#'   "quasipoisson"`.
30
#' @param sparse specifies whether the `"dfm"` is coerced to dense.  While
31
#'   setting this to `TRUE` will make it possible to handle larger dfm
32
#'   objects (and make execution faster), it will generate slightly different
33
#'   results each time, because the sparse SVD routine has a stochastic element.
34
#' @param abs_err specifies how the convergence is considered
35
#' @param svd_sparse uses svd to initialize the starting values of theta,
36
#'   only applies when `sparse = TRUE`
37
#' @param residual_floor specifies the threshold for residual matrix when
38
#'   calculating the svds, only applies when `sparse = TRUE`
39
#' @return An object of class `textmodel_fitted_wordfish`.  This is a list
40
#'   containing: \item{dir}{global identification of the dimension}
41
#'   \item{theta}{estimated document positions} \item{alpha}{estimated document
42
#'   fixed effects} \item{beta}{estimated feature marginal effects}
43
#'   \item{psi}{estimated word fixed effects} \item{docs}{document labels}
44
#'   \item{features}{feature labels} \item{sigma}{regularization parameter for
45
#'   betas in Poisson form} \item{ll}{log likelihood at convergence}
46
#'   \item{se.theta}{standard errors for theta-hats} \item{x}{dfm to which
47
#'   the model was fit}
48
#' @details The returns match those of Will Lowe's R implementation of
49
#'   `wordfish` (see the austin package), except that here we have renamed
50
#'   `words` to be `features`.  (This return list may change.)  We
51
#'   have also followed the practice begun with Slapin and Proksch's early
52
#'   implementation of the model that used a regularization parameter of
53
#'   se\eqn{(\sigma) = 3}, through the third element in `priors`.
54
#'
55
#' @note In the rare situation where a warning message of "The algorithm did not
56
#'   converge." shows up, removing some documents may work.
57
#' @seealso [predict.textmodel_wordfish()]
58
#' @references Slapin, J. & Proksch, S.O. (2008).
59
#'   [A Scaling Model
60
#'   for Estimating Time-Series Party Positions from Texts](https://doi.org/10.1111/j.1540-5907.2008.00338.x). *American
61
#'   Journal of Political Science*, 52(3), 705--772.
62
#'
63
#'   Lowe, W. & Benoit, K.R. (2013). [Validating
64
#'   Estimates of Latent Traits from Textual Data Using Human Judgment as a Benchmark](http://doi.org/10.1093/pan/mpt002).
65
#'   *Political Analysis*, 21(3), 298--313.
66
#' @author Benjamin Lauderdale, Haiyan Wang, and Kenneth Benoit
67
#' @examples
68
#' (tmod1 <- textmodel_wordfish(data_dfm_lbgexample, dir = c(1,5)))
69
#' summary(tmod1, n = 10)
70
#' coef(tmod1)
71
#' predict(tmod1)
72
#' predict(tmod1, se.fit = TRUE)
73
#' predict(tmod1, interval = "confidence")
74
#'
75
#' \dontrun{
76
#' library("quanteda")
77
#' dfmat <- dfm(data_corpus_irishbudget2010)
78
#' (tmod2 <- textmodel_wordfish(dfmat, dir = c(6,5)))
79
#' (tmod3 <- textmodel_wordfish(dfmat, dir = c(6,5),
80
#'                              dispersion = "quasipoisson", dispersion_floor = 0))
81
#' (tmod4 <- textmodel_wordfish(dfmat, dir = c(6,5),
82
#'                              dispersion = "quasipoisson", dispersion_floor = .5))
83
#' plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
84
#'      xlim = c(0, 1.0), ylim = c(0, 1.0))
85
#' plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
86
#'      xlim = c(0, 1.0), ylim = c(0, 1.0), type = "n")
87
#' underdispersedTerms <- sample(which(tmod3$phi < 1.0), 5)
88
#' which(featnames(dfmat) %in% names(topfeatures(dfmat, 20)))
89
#' text(tmod3$phi, tmod4$phi, tmod3$features,
90
#'      cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "grey90")
91
#' text(tmod3$phi['underdispersedTerms'], tmod4$phi['underdispersedTerms'],
92
#'      tmod3$features['underdispersedTerms'],
93
#'      cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "black")
94
#' if (requireNamespace("austin")) {
95
#'     tmod5 <- austin::wordfish(quanteda::as.wfm(dfmat), dir = c(6, 5))
96
#'     cor(tmod1$theta, tmod5$theta)
97
#' }}
98
#' @importFrom quanteda as.dfm
99
#' @export
100
textmodel_wordfish <- function(x, dir = c(1, 2),
101
                               priors = c(Inf, Inf, 3, 1),
102
                               tol = c(1e-6, 1e-8),
103
                               dispersion = c("poisson", "quasipoisson"),
104
                               dispersion_level = c("feature", "overall"),
105
                               dispersion_floor = 0,
106
                               sparse = FALSE,
107
                               abs_err = FALSE,
108
                               svd_sparse = TRUE,
109
                               residual_floor = 0.5) {
110 1
    UseMethod("textmodel_wordfish")
111
}
112

113
#' @export
114
textmodel_wordfish.default <- function(x, dir = c(1, 2),
115
                                       priors = c(Inf, Inf, 3, 1),
116
                                       tol = c(1e-6, 1e-8),
117
                                       dispersion = c("poisson", "quasipoisson"),
118
                                       dispersion_level = c("feature", "overall"),
119
                                       dispersion_floor = 0,
120
                                       sparse = FALSE,
121
                                       abs_err = FALSE,
122
                                       svd_sparse = TRUE,
123
                                       residual_floor = 0.5) {
124 1
    stop(friendly_class_undefined_message(class(x), "textmodel_wordfish"))
125
}
126

127
#' @export
128
textmodel_wordfish.dfm <- function(x, dir = c(1, 2),
129
                                   priors = c(Inf, Inf, 3, 1),
130
                                   tol = c(1e-6, 1e-8),
131
                                   dispersion = c("poisson", "quasipoisson"),
132
                                   dispersion_level = c("feature", "overall"),
133
                                   dispersion_floor = 0,
134
                                   sparse = FALSE,
135
                                   abs_err = FALSE,
136
                                   svd_sparse = TRUE,
137
                                   residual_floor = 0.5) {
138

139 1
    x <- as.dfm(x)
140 1
    if (!sum(x)) stop(message_error("dfm_empty"))
141 1
    dispersion <- match.arg(dispersion)
142 1
    dispersion_level <- match.arg(dispersion_level)
143

144
    # check that no rows or columns are all zero
145 1
    empty_docs <- which(ntoken(x) == 0)
146 1
    if (length(empty_docs)) {
147 0
        catm("Note: removed the following zero-token documents:",
148 0
             docnames(x)[empty_docs], "\n")
149 0
        x <- x[empty_docs * -1, ]
150
    }
151 1
    empty_feats <- which(docfreq(x) == 0)
152 1
    if (length(empty_feats)) {
153 0
        catm("Note: removed the following zero-count features:",
154 0
             featnames(x)[empty_feats], "\n")
155 0
        x <- x[, empty_feats * -1]
156
    }
157 0
    if (length(empty_docs) || length(empty_feats)) catm("\n")
158

159
    # some error checking
160 1
    if (length(priors) != 4)
161 0
        stop("priors requires 4 elements")
162 1
    if (length(tol) != 2)
163 0
        stop("tol requires 2 elements")
164 1
    if (!is.numeric(priors) || !is.numeric(tol))
165 0
        stop("priors and tol must be numeric")
166 1
    if (dispersion_floor < 0 || dispersion_floor > 1.0)
167 0
        stop("dispersion_floor must be between 0 and 1.0")
168

169 1
    if (dispersion == "poisson" && dispersion_floor != 0)
170 0
        warning("dispersion_floor argument ignored for poisson")
171

172
    # check quasi-poisson settings and translate into numerical values
173
    # 1 = Poisson,
174
    # 2 = quasi-Poisson, overall dispersion,
175
    # 3 = quasi-Poisson, term dispersion,
176
    # 4 = quasi-Poisson, term dispersion w/floor
177 1
    if (dispersion == "poisson") {
178 1
        disp <- 1L
179 1
    } else if (dispersion == "quasipoisson" && dispersion_level == "overall") {
180 1
        disp <- 2L
181 1
    } else if (dispersion == "quasipoisson" && dispersion_level == "feature") {
182 1
        if (dispersion_floor) {
183 0
            disp <- 4L
184
        } else {
185 1
            disp <- 3L
186
        }
187
    } else {
188 0
        stop("Illegal option combination.")
189
    }
190 1
    if (sparse == TRUE) {
191 1
        result <- qatd_cpp_wordfish(x, as.integer(dir), 1 / (priors ^ 2),
192 1
                                    tol, disp,
193 1
                                    dispersion_floor, abs_err, svd_sparse,
194 1
                                    residual_floor)
195
    } else{
196 1
        result <- qatd_cpp_wordfish_dense(as.matrix(x),
197 1
                                          as.integer(dir), 1 / (priors ^ 2),
198 1
                                          tol, disp,
199 1
                                          dispersion_floor, abs_err)
200
    }
201
    # NOTE: psi is a 1 x nfeat matrix, not a numeric vector
202
    #       alpha is a ndoc x 1 matrix, not a numeric vector
203 1
    if (any(is.nan(result$theta)))
204 0
        warning("Warning: The algorithm did not converge.")
205

206 1
    result <- list(
207 1
        x = x,
208 1
        docs = docnames(x),
209 1
        features = featnames(x),
210 1
        dir = dir,
211 1
        dispersion = dispersion,
212 1
        priors = priors,
213 1
        theta = as.numeric(result$theta),
214 1
        beta = as.numeric(result$beta),
215 1
        psi = as.numeric(result$psi),
216 1
        alpha = as.numeric(result$alpha),
217 1
        phi = as.numeric(result$phi),
218 1
        se.theta = as.numeric(result$thetaSE) ,
219 1
        call = match.call()
220
    )
221 1
    class(result) <- c("textmodel_wordfish", "textmodel", "list")
222 1
    result
223
}
224

225
#' Prediction from a textmodel_wordfish method
226
#'
227
#' `predict.textmodel_wordfish()` returns estimated document scores and
228
#' confidence intervals.  The method is provided for consistency with other
229
#' `textmodel_*()` methods, but does not currently allow prediction on
230
#' out-of-sample data.
231
#' @param object a fitted wordfish model
232
#' @inheritParams predict.textmodel_wordscores
233
#' @importFrom stats predict
234
#' @method predict textmodel_wordfish
235
#' @keywords textmodel internal
236
#' @export
237
predict.textmodel_wordfish <- function(object,
238
                                       se.fit = FALSE,
239
                                       interval = c("none", "confidence"), level = 0.95,
240
                                       ...) {
241 0
    if (length(list(...)) > 0) stop("Arguments:", names(list(...)), "not supported.\n")
242 1
    interval <- match.arg(interval)
243

244 1
    fit <- object$theta
245 1
    names(fit) <- object$docs
246

247 1
    if (!se.fit && interval == "none") {
248 1
        class(fit) <- c("predict.textmodel_wordfish", "numeric")
249 1
        return(fit)
250
    }
251

252 1
    result <- list(fit = fit)
253 1
    if (se.fit) result$se.fit <- object$se.theta
254 1
    if (interval == "confidence") {
255 1
        result$fit <- matrix(result$fit, ncol = 3, nrow = length(result$fit),
256 1
                             dimnames = list(names(result$fit), c("fit", "lwr", "upr")))
257 1
        z <- stats::qnorm(1 - (1 - level) / 2)
258 1
        result$fit[, "lwr"] <- fit - z * object$se.theta
259 1
        result$fit[, "upr"] <- fit + z * object$se.theta
260
    }
261 1
    class(result) <- c("predict.textmodel_wordscores", class(result))
262 1
    result
263
}
264

265
#' print method for a wordfish model
266
#' @param x for print method, the object to be printed
267
#' @param ... unused
268
#' @method print textmodel_wordfish
269
#' @keywords internal textmodel
270
#' @export
271
print.textmodel_wordfish <- function(x, ...) {
272 1
    cat("\nCall:\n")
273 1
    print(x$call)
274 1
    cat("\n",
275 1
        "Dispersion: ", x$dispersion, "; ",
276 1
        "direction: ", x$dir[1], ' < ' , x$dir[2], "; ",
277 1
        ndoc(x), " documents; ",
278 1
        nfeat(x), " features.",
279 1
        "\n",
280 1
        sep = "")
281
}
282

283
#' summary method for textmodel_wordfish
284
#' @param object a [textmodel_wordfish] object
285
#' @param n maximum number of features to print in summary
286
#' @param ... unused
287
#' @export
288
#' @method summary textmodel_wordfish
289
#' @keywords internal textmodel
290
summary.textmodel_wordfish <- function(object, n = 30, ...) {
291

292 1
    stat <- data.frame(
293 1
        theta = object$theta,
294 1
        se = object$se.theta,
295 1
        row.names = object$docs,
296 1
        check.rows = FALSE,
297 1
        stringsAsFactors = FALSE
298
    )
299

300 1
    result <- list(
301 1
        'call' = object$call,
302 1
        'estimated.document.positions' = as.statistics_textmodel(stat),
303 1
        'estimated.feature.scores' = as.coefficients_textmodel(head(coef(object)$features, n))
304
    )
305 1
    return(as.summary.textmodel(result))
306
}
307

308
#' @rdname predict.textmodel_wordfish
309
#' @param margin which margin of parameter estimates to return: both (in a
310
#'   list), or just document or feature parameters
311
#' @method coef textmodel_wordfish
312
#' @return `coef.textmodel_wordfish()` returns a matrix of estimated
313
#'   parameters coefficients for the specified margin.
314
#' @export
315
coef.textmodel_wordfish <- function(object, margin = c("both", "documents", "features"), ...) {
316 1
    margin <- match.arg(margin)
317 1
    result <- list(
318 1
        documents = matrix(cbind(object$theta, object$alpha), ncol = 2,
319 1
                           dimnames = list(docnames(object), c("theta", "alpha"))),
320 1
        features = matrix(cbind(object$beta, object$psi), ncol = 2,
321 1
                          dimnames = list(featnames(object), c("beta", "psi")))
322
    )
323 1
    if (margin == "documents") {
324 1
        result[["documents"]]
325 1
    } else if (margin == "features") {
326 1
        result[["features"]]
327 1
    } else result
328
}
329

330
#' @export
331
#' @rdname predict.textmodel_wordfish
332
coefficients.textmodel_wordfish <- function(object, ...) {
333 0
    UseMethod("coef")
334
}
335

336
#' @export
337
#' @method print predict.textmodel_wordfish
338
print.predict.textmodel_wordfish <- function(x, ...) {
339 0
    print(unclass(x))
340
}

Read our documentation on viewing source code .

Loading