# main methods --------------

#' Wordscores text model
#'
#' `textmodel_wordscores` implements Laver, Benoit and Garry's (2003)
#' "Wordscores" method for scaling texts on a single dimension, given a set of
#' anchoring or *reference* texts whose values are set through reference
#' scores. This scale can be fitted in the linear space (as per LBG 2003) or in
#' the logit space (as per Beauchamp 2012).  Estimates of *virgin* or
#' unknown texts are obtained using the `predict()` method to score
#' documents from a fitted `textmodel_wordscores` object.
#'
#' @param x the [dfm] on which the model will be trained
#' @param y vector of training scores associated with each document
#'   in `x`
#' @param smooth a smoothing parameter for word counts; defaults to zero to
#'   match the LBG (2003) method. See Value below for additional information on
#'   the behaviour of this argument.
#' @param scale scale on which to score the words; `"linear"` for classic
#'   LBG linear posterior weighted word class differences, or `"logit"`
#'   for log posterior differences
#' @details The `textmodel_wordscores()` function and the associated
#'   [`predict()`][predict.textmodel_wordscores] method are designed
#'   to function in the same manner as [stats::predict.lm()].
#'   `coef()` can also be used to extract the word coefficients from the
#'   fitted `textmodel_wordscores` object, and `summary()` will print
#'   a nice summary of the fitted object.
#' @return  A fitted `textmodel_wordscores` object.  This object will
#'   contain a copy of the input data, but in its original form without any
#'   smoothing applied. Calling [predict.textmodel_wordscores()] on
#'   this object without specifying a value for `newdata`, for instance,
#'   will predict on the unsmoothed object.  This behaviour differs from
#'   versions of \pkg{quanteda} <= 1.2.
#' @seealso [predict.textmodel_wordscores()] for methods of applying a
#'   fitted [textmodel_wordscores] model object to predict quantities from
#'   (other) documents.
#' @author Kenneth Benoit
#' @examples
#' (tmod <- textmodel_wordscores(data_dfm_lbgexample, y = c(seq(-1.5, 1.5, .75), NA)))
#' summary(tmod)
#' coef(tmod)
#' predict(tmod)
#' predict(tmod, rescaling = "lbg")
#' predict(tmod, se.fit = TRUE, interval = "confidence", rescaling = "mv")
#' @references Laver, M., Benoit, K.R., & Garry, J. (2003).
#'   [Estimating
#'   Policy Positions from Political Text using Words as Data](https://kenbenoit.net/pdfs/WORDSCORESAPSR.pdf). *American
#'   Political Science Review*, 97(2), 311--331.
#'
#'   Beauchamp, N. (2012). [Using
#'   Text to Scale Legislatures with Uninformative Voting](http://nickbeauchamp.com/work/Beauchamp_scaling_current.pdf). New York University Mimeo.
#'
#'   Martin, L.W. & Vanberg, G. (2007). [A Robust
#'   Transformation Procedure for Interpreting Political Text](https://doi.org/10.1093/pan/mpm010). *Political Analysis*
#'   16(1), 93--100.
#' @importFrom quanteda dfm_weight dfm_smooth as.dfm
#' @export
textmodel_wordscores <- function(x, y, scale = c("linear", "logit"), smooth = 0) {
    # S3 generic; dispatches on the class of x (see the .dfm method below)
    UseMethod("textmodel_wordscores")
}

#' @export
textmodel_wordscores.default <- function(x, y, scale = c("linear", "logit"), smooth = 0) {
    # Fallback method: any class without a dedicated method gets a friendly
    # "not defined for this class" error rather than a cryptic dispatch failure.
    stop(friendly_class_undefined_message(class(x), "textmodel_wordscores"))
}

#' @export
textmodel_wordscores.dfm <- function(x, y, scale = c("linear", "logit"), smooth = 0) {
    x <- as.dfm(x)
    if (!sum(x)) stop(message_error("dfm_empty"))
    scale <- match.arg(scale)
    call <- match.call()

    # Input validation: need >= 2 documents, one numeric score per document
    if (nrow(x) < 2)
        stop("wordscores model requires at least two training documents.")
    if (nrow(x) != length(y))
        stop("trainingdata and scores vector must refer to same number of documents.")
    if (!is.numeric(y))
        stop("wordscores model requires numeric scores.")

    # Keep only the reference documents (those with non-missing scores);
    # documents with NA scores are "virgin" texts scored later via predict()
    temp <- x[!is.na(y), ]
    ref <- y[!is.na(y)]

    # Optional Laplace-style count smoothing before computing proportions
    if (smooth)
        temp <- dfm_smooth(temp, smooth)

    # tFwr: P(word | reference document), transposed so rows are features;
    # Pwr: posterior P(reference document | word)
    tFwr <- t(dfm_weight(temp, "prop", force = TRUE))
    Pwr <- tFwr / rowSums(tFwr)    # posterior word probability Pwr
    # compute likelihoods "Pwr" Pr(this word | document)
    if (scale == "linear") {
        # LBG (2003): word score = posterior-weighted mean of reference scores
        Sw <- Pwr %*% ref
        Sw <- Sw[, 1]
    } else if (scale == "logit") {
        # Beauchamp (2012): log posterior difference between two references.
        # NOTE(review): these checks use the full y (including NAs), not ref;
        # an NA score here makes range(y) contain NA and rescaler() propagate
        # NA -- confirm whether logit scale is meant to forbid NA scores.
        if (length(y) > 2)
            stop("\nFor logit scale, only two training texts can be used.")
        if (!identical(c(-1, 1), range(y))) {
            warning("\nFor logit scale, training scores are automatically rescaled to -1 and 1.")
            y <- rescaler(y)
        }
        # Orient the difference so higher word scores align with the
        # higher-scored reference text
        if (y[1] > y[2]) {
            Sw <- log(Pwr[, 1]) - log(Pwr[, 2])
        } else {
            Sw <- log(Pwr[, 2]) - log(Pwr[, 1])
        }
    }

    # Store unsmoothed input data (see @return note) plus the fitted scores;
    # features never seen in the reference texts (NA scores) are dropped
    result <- list(
        wordscores = Sw[!is.na(Sw)],
        x = x,
        y = y,
        scale = scale,
        call = call
    )
    class(result) <- c("textmodel_wordscores", "textmodel", "list")
    result
}

#' Predict textmodel_wordscores
#' @param object a fitted Wordscores textmodel
#' @param newdata dfm on which prediction should be made
#' @param se.fit if `TRUE`, return standard errors as well
#' @param rescaling `"none"` for "raw" scores; `"lbg"` for LBG (2003)
#'   rescaling; or `"mv"` for the rescaling proposed by Martin and Vanberg
#'   (2007).  See References.
#' @param interval type of confidence interval calculation
#' @param level tolerance/confidence level for intervals
#' @param force make the feature set of `newdata` conform to the model
#'   terms.  The default of `TRUE` means that a fitted model can be applied
#'   to scale a dfm that does not contain a 1:1 match of features in the
#'   training and prediction data.
#' @param ... not used
#' @return
#' `predict.textmodel_wordscores()` returns a named vector of predicted
#' document scores ("text scores" \eqn{S_{vd}} in LBG 2003), or a named list if
#' `se.fit = TRUE` consisting of the predicted scores (`$fit`) and the
#' associated standard errors (`$se.fit`). When `interval =
#' "confidence"`, the predicted values will be a matrix.  This behaviour matches
#' that of [stats::predict.lm()].
#' @examples
#' tmod <- textmodel_wordscores(data_dfm_lbgexample, c(seq(-1.5, 1.5, .75), NA))
#' predict(tmod)
#' predict(tmod, rescaling = "mv")
#' predict(tmod, rescaling = "lbg")
#' predict(tmod, se.fit = TRUE)
#' predict(tmod, se.fit = TRUE, interval = "confidence")
#' predict(tmod, se.fit = TRUE, interval = "confidence", rescaling = "lbg")
#' @keywords textmodel internal
#' @importFrom stats predict
#' @method predict textmodel_wordscores
#' @export
#' @importFrom stats qnorm median sd
predict.textmodel_wordscores <- function(object,
                                         newdata = NULL,
                                         se.fit = FALSE,
                                         interval = c("none", "confidence"), level = 0.95,
                                         rescaling = c("none", "lbg", "mv"),
                                         force = TRUE,
                                         ...) {

    unused_dots(...)

    interval <- match.arg(interval)
    rescaling <- match.arg(rescaling)

    # Predict on the training dfm when no newdata is supplied (predict.lm-like)
    if (!is.null(newdata)) {
        data <- as.dfm(newdata)
    } else {
        data <- as.dfm(object$x)
    }

    # Compute text scores as weighted mean of word scores in "virgin" document
    sw <- coef(object)
    data <- force_conformance(data, names(sw), force)

    # This is different from computing term weights on only the scorable words.
    # It take rowSums() only to generates named vector.
    raw <- rowSums(dfm_weight(data, "prop", force = TRUE) %*% sw)

    # if (verbose)
    #    catm(sprintf('%d of %d features (%.2f%%) can be scored\n\n',
    #         length(sw), nfeat(data), 100 * length(sw) / nfeat(data)))

    if (rescaling == "mv") {
        # Martin & Vanberg (2007): anchor the two extreme reference documents
        if (sum(!is.na(object$y)) > 2)
            warning("More than two reference scores found with MV rescaling; using only min, max values.")
        fit <- mv_transform(raw, object$y, raw)
    } else if (rescaling == "lbg") {
        # LBG (2003): stretch raw scores to the dispersion of the reference scores
        lbg_sdr <- stats::sd(object$y, na.rm = TRUE)
        lbg_sv <- mean(raw, na.rm = TRUE)
        lbg_sdv <- if (length(raw) < 2L) 0 else stats::sd(raw)
        # NOTE(review): if lbg_sdv == 0 while lbg_sdr != 0 this divides by
        # zero, giving an Inf multiplier -- confirm whether constant raw
        # scores should be guarded here
        lbg_mult <- if (lbg_sdr == 0) 0 else lbg_sdr / lbg_sdv
        fit <- (raw - lbg_sv) * lbg_mult + lbg_sv
    } else {
        fit <- raw
    }

    # Fast path: plain fitted values, no SEs or intervals requested
    if (!se.fit && interval == "none") {
        class(fit) <- c("predict.textmodel_wordscores", "numeric")
        return(fit)
    }

    # Compute standard error of each raw text score (LBG 2003 formula)
    raw_se <- rep(NA, length(raw))
    fwv <- dfm_weight(data, "prop", force = TRUE)
    for (i in seq_along(raw_se))
        raw_se[i] <- sqrt(sum(as.numeric(fwv[i,]) * (raw[i] - sw) ^ 2)) / sqrt(rowSums(data)[[i]])

    result <- list(fit = fit)
    if (se.fit) {
        if (rescaling == "mv") {
            # Back out the SE on the MV scale from a transformed upper bound
            z <- stats::qnorm(1 - (1 - level) / 2)
            upr <- mv_transform(raw + z * raw_se, object$y, raw)
            result$se.fit <- (upr - result$fit) / z
        } else if (rescaling == "lbg") {
            # NOTE(review): applying the location shift (- lbg_sv ... + lbg_sv)
            # to a standard error looks suspect; a pure scaling
            # (raw_se * lbg_mult) may be intended -- confirm against LBG (2003)
            result$se.fit <- (raw_se - lbg_sv) * lbg_mult + lbg_sv
        } else {
            result$se.fit <- raw_se
        }
    }

    if (interval == "confidence") {
        # make fit into a matrix with fit/lwr/upr columns, as predict.lm does
        result$fit <- matrix(result$fit, ncol = 3, nrow = length(result$fit),
                             dimnames = list(names(result$fit), c("fit", "lwr", "upr")))

        # Compute confidence intervals on the raw scale, then rescale bounds
        z <- stats::qnorm(1 - (1 - level) / 2)
        raw <- unname(raw)

        if (rescaling == "mv") {
            result$fit[, "lwr"] <- mv_transform(raw - z * raw_se, object$y, raw)
            result$fit[, "upr"] <- mv_transform(raw + z * raw_se, object$y, raw)
        } else if (rescaling == "lbg") {
            if (lbg_mult == 0) {
                result$fit[, "lwr"] <- raw - z * raw_se
                result$fit[, "upr"] <- raw + z * raw_se
            } else {
                result$fit[, "lwr"] <- ((raw - z * raw_se) - lbg_sv) * lbg_mult + lbg_sv
                result$fit[, "upr"] <- ((raw + z * raw_se) - lbg_sv) * lbg_mult + lbg_sv
            }
        } else {
            result$fit[, "lwr"] <- raw - z * raw_se
            result$fit[, "upr"] <- raw + z * raw_se
        }
    }

    class(result) <- c("predict.textmodel_wordscores", class(result))
    result
}

# internal methods -----------

## Rescale a vector so that the endpoints match scale.min, scale.max
## (linear map of [min(x), max(x)] onto [scale.min, scale.max]).
rescaler <- function(x, scale.min = -1, scale.max = 1) {
    rng <- range(x)
    # a constant vector has no width to rescale; fail loudly rather than
    # silently returning NaN (isTRUE() keeps NA inputs propagating as before)
    if (isTRUE(rng[1] == rng[2]))
        stop("cannot rescale a vector with zero range")
    scale.width <- scale.max - scale.min
    scale.factor <- scale.width / (rng[2] - rng[1])
    # BUG FIX: offset by scale.min, not "- scale.max"; the old form was only
    # correct for ranges symmetric about zero (e.g. the default [-1, 1])
    (x - rng[1]) * scale.factor + scale.min
}

## Internal function for MV rescaling (Martin & Vanberg 2007): linearly map
## raw scores x so that the raw scores z of the lowest- and highest-scored
## reference documents land exactly on min(y) and max(y).
mv_transform <- function(x, y, z) {
    y_min <- min(y, na.rm = TRUE)
    y_max <- max(y, na.rm = TRUE)
    # BUG FIX: which() returns every tied index; take the first match so tied
    # reference scores cannot produce a vector index (and silent recycling)
    i_low <- which(y == y_min)[1]
    i_high <- which(y == y_max)[1]
    (x - z[i_low]) * (y_max - y_min) / (z[i_high] - z[i_low]) + y_min
}

# redefined generic methods -----------

#' @export
#' @method print textmodel_wordscores
#' @noRd
print.textmodel_wordscores <- function(x, ...) {
    # Brief console summary: the fitting call, the scale, and counts of
    # non-missing reference scores and scored features
    cat("\nCall:\n")
    print(x$call)
    cat("\n",
        "Scale: ", x$scale, "; ",
        length(na.omit(x$y)), " reference scores; ",
        length(na.omit(x$wordscores)), " scored features.",
        "\n",
        sep = "")
    # print methods conventionally return their argument invisibly
    invisible(x)
}

#' @export
#' @noRd
#' @method summary textmodel_wordscores
summary.textmodel_wordscores <- function(object, n = 30L, ...) {

    # Per-document count statistics over the training dfm; rows are documents.
    # NOTE(review): apply() coerces the (sparse) dfm row-by-row -- adequate
    # here, but confirm acceptable for very large training dfms.
    stat <- data.frame(
        score = object$y,
        total = apply(object$x, 1, sum),
        min = apply(object$x, 1, min),
        max = apply(object$x, 1, max),
        mean = apply(object$x, 1, mean),
        median = apply(object$x, 1, stats::median),
        row.names = docnames(object$x),
        check.rows = FALSE,
        stringsAsFactors = FALSE
    )
    # Assemble the call, document statistics, and the first n word scores
    # into the standard textmodel summary structure
    result <- list(
        'call' = object$call,
        'reference.document.statistics' = as.statistics_textmodel(stat),
        'wordscores' = as.coefficients_textmodel(head(coef(object), n))
    )
    as.summary.textmodel(result)
}


309
#' @noRd
310
#' @method coef textmodel_wordscores
311
#' @export
312
coef.textmodel_wordscores <- function(object, ...) {
313 1
    object$wordscores
314
}
315

316
#' @noRd
#' @method coefficients textmodel_wordscores
#' @export
coefficients.textmodel_wordscores <- function(object, ...) {
    # Alias: re-dispatch on coef() so both generics share one implementation
    UseMethod("coef")
}

#' @export
#' @method print predict.textmodel_wordscores
print.predict.textmodel_wordscores <- function(x, ...) {
    # Strip the class so the underlying vector/list prints with the default
    # method instead of recursing into this one
    print(unclass(x))
    # print methods conventionally return their argument invisibly
    invisible(x)
}


Read our documentation on viewing source code .

Loading