quanteda / quanteda.textmodels
Showing 2 of 12 files from the diff.

@@ -85,3 +85,19 @@
 catm <- function(..., sep = " ", appendLF = FALSE) {
     message(paste(..., sep = sep), appendLF = appendLF)
 }
+
+
+## make cols add up to one
+colNorm <- function(x) {
+    x / outer(rep(1, nrow(x)), colSums(x))
+}
+
+## fast way to group by class for sparse matrix
+## outputs a dense matrix
+group_classes <- function(x, y, smooth = 0) {
+    levels <- levels(as.factor(y))
+    x <- lapply(levels, function(lev) Matrix::colSums(x[y == lev, , drop = FALSE]) + smooth)
+    names(x) <- levels
+    do.call("rbind", x)
+}
+
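
A quick sketch of what the two new helpers do, assuming they have been sourced; the toy matrix m, the class vector cls, and their values are invented for illustration and are not from the package tests:

library("Matrix")

# toy document-feature counts: three documents, two features
m <- Matrix(c(1, 0, 2,
              0, 3, 1), nrow = 3, sparse = TRUE,
            dimnames = list(c("d1", "d2", "d3"), c("w1", "w2")))
cls <- c("A", "A", "B")

# group_classes() sums the rows within each class and adds the smoothing
# constant, returning an ordinary dense matrix with one row per class
group_classes(m, cls, smooth = 1)
#   w1 w2
# A  2  4
# B  3  2

# colNorm() then rescales each column of that matrix so it sums to one
colNorm(group_classes(m, cls, smooth = 1))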

@@ -11,26 +11,22 @@
 #'   feature frequency totals by training class
 #' @param prior prior distribution on texts; one of `"uniform"`,
 #'   `"docfreq"`, or `"termfreq"`.  See Prior Distributions below.
-#' @param distribution count model for text features, can be `multinomial`
-#'   or `Bernoulli`.  To fit a "binary multinomial" model, first convert
-#'   the dfm to a binary matrix using `[quanteda::dfm_weight](x, scheme = "boolean")`.
+#' @param distribution count model for text features, can be `multinomial` or
+#'   `Bernoulli`.  To fit a "binary multinomial" model, first convert the dfm to
+#'   a binary matrix using `[quanteda::dfm_weight](x, scheme = "boolean")`.
 #' @return
 #' `textmodel_nb()` returns a list consisting of the following (where
 #' \eqn{I} is the total number of documents, \eqn{J} is the total number of
 #' features, and \eqn{k} is the total number of training classes):
 #' @return \item{call}{original function call}
-#' @return \item{PwGc}{\eqn{k \times J}; probability of the word given the class
-#'   (empirical likelihood)}
-#' @return \item{Pc}{\eqn{k}-length named numeric vector of class prior
-#'   probabilities}
-#' @return \item{PcGw}{\eqn{k \times J}; posterior class probability given the
-#'   word}
-#' @return \item{Pw}{\eqn{J \times 1}; baseline probability of the word}
-#' @return \item{x}{the \eqn{I \times J} training dfm `x`}
-#' @return \item{y}{the \eqn{I}-length `y` training class vector}
-#' @return \item{distribution}{the distribution argument}
-#' @return \item{prior}{the prior argument}
-#' @return \item{smooth}{the value of the smoothing parameter}
+#' @return \item{param}{\eqn{k \times V}; class conditional posterior estimates}
+#' @return \item{x}{the \eqn{N \times V} training dfm `x`}
+#' @return \item{y}{the \eqn{N}-length `y` training class vector, where NAs will
+#'   not be used in training but will be retained in the saved `x` matrix}
+#' @return \item{distribution}{character; the distribution of `x` for the NB
+#'   model}
+#' @return \item{priors}{numeric; the class prior probabilities}
+#' @return \item{smooth}{numeric; the value of the smoothing parameter}
 #' @section Prior distributions:
 #'
 #'   Prior distributions refer to the prior probabilities assigned to the
@@ -60,18 +56,19 @@
 #'   The `smooth` value is added to the feature frequencies, aggregated by
 #'   training class, to avoid zero frequencies in any class.  This has the
 #'   effect of giving more weight to infrequent term occurrences.
-#' @references Manning, C.D., Raghavan, P., & Schütze, H. (2008).
-#'   *An Introduction to Information Retrieval*. Cambridge: Cambridge University Press
-#'   (Chapter 13). Available at <https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf>.
+#' @references Manning, C.D., Raghavan, P., & Schütze, H. (2008). *An
+#'   Introduction to Information Retrieval*. Cambridge: Cambridge University
+#'   Press (Chapter 13). Available at
+#'   <https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf>.
 #'
-#'   Jurafsky, D. & Martin, J.H. (2018).
-#'   From *Speech and Language Processing: An Introduction to Natural Language
-#'   Processing, Computational Linguistics, and Speech Recognition*. Draft of September 23, 2018
-#'   (Chapter 6, Naive Bayes). Available at <https://web.stanford.edu/~jurafsky/slp3/>.
+#'   Jurafsky, D. & Martin, J.H. (2018). From *Speech and Language Processing:
+#'   An Introduction to Natural Language Processing, Computational Linguistics,
+#'   and Speech Recognition*. Draft of September 23, 2018 (Chapter 6, Naive
+#'   Bayes). Available at <https://web.stanford.edu/~jurafsky/slp3/>.
 #'
 #' @seealso [predict.textmodel_nb()]
 #' @author Kenneth Benoit
-#' @importFrom quanteda dfm_weight dfm_group dfm_smooth as.dfm
+#' @importFrom quanteda dfm_weight as.dfm
 #' @examples
 #' ## Example from 13.1 of _An Introduction to Information Retrieval_
 #' txt <- c(d1 = "Chinese Beijing Chinese",
@@ -79,23 +76,24 @@
 #'          d3 = "Chinese Macao",
 #'          d4 = "Tokyo Japan Chinese",
 #'          d5 = "Chinese Chinese Chinese Tokyo Japan")
-#' trainingset <- quanteda::dfm(txt, tolower = FALSE)
-#' trainingclass <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
+#' x <- quanteda::dfm(txt, tolower = FALSE)
+#' y <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
 #'
 #' ## replicate IIR p261 prediction for test set (document 5)
-#' (tmod1 <- textmodel_nb(trainingset, y = trainingclass, prior = "docfreq"))
+#' (tmod1 <- textmodel_nb(x, y, prior = "docfreq"))
 #' summary(tmod1)
 #' coef(tmod1)
+#' predict(tmod1, type = "prob")
 #' predict(tmod1)
 #'
 #' # contrast with other priors
-#' predict(textmodel_nb(trainingset, y = trainingclass, prior = "uniform"))
-#' predict(textmodel_nb(trainingset, y = trainingclass, prior = "termfreq"))
+#' predict(textmodel_nb(x, y, prior = "uniform"))
+#' predict(textmodel_nb(x, y, prior = "termfreq"))
 #'
 #' ## replicate IIR p264 Bernoulli Naive Bayes
-#' tmod2 <- textmodel_nb(trainingset, y = trainingclass, distribution = "Bernoulli",
-#'                         prior = "docfreq")
-#' predict(tmod2, newdata = trainingset[5, ])
+#' tmod2 <- textmodel_nb(x, y, distribution = "Bernoulli", prior = "docfreq")
+#' predict(tmod2, newdata = x[5, ], type = "prob")
+#' predict(tmod2, newdata = x[5, ])
 #' @export
 textmodel_nb <- function(x, y, smooth = 1,
                          prior = c("uniform", "docfreq", "termfreq"),
@@ -111,72 +109,72 @@
 }
 
 #' @export
-#' @importFrom quanteda colSums rowSums
+#' @importFrom quanteda colSums rowSums dfm_weight
 textmodel_nb.dfm <- function(x, y, smooth = 1,
                              prior = c("uniform", "docfreq", "termfreq"),
                              distribution = c("multinomial", "Bernoulli")) {
     x <- as.dfm(x)
+    y <- as.factor(y)
     if (!sum(x)) stop(message_error("dfm_empty"))
 
     prior <- match.arg(prior)
     distribution <- match.arg(distribution)
-    call <- match.call()
 
-    y <- as.factor(y)
-    if (stats::var(as.numeric(y), na.rm = TRUE) == 0) stop("y cannot be constant")
+    if (stats::var(as.numeric(y), na.rm = TRUE) == 0)
+        stop("y cannot be constant")
 
-    temp <- x[!is.na(y),]
-    class <- y[!is.na(y)]
+    result <- list(
+        call = match.call(),
+        x = x, y = y,
+        distribution = distribution,
+        smooth = smooth
+    )
+
+    if (anyNA(y)) {
+        x <- x[!is.na(y), ]
+        y <- y[!is.na(y)]
+    }
 
-    ## distribution
-    if (distribution == "Bernoulli")
-        temp <- dfm_weight(temp, "boolean", force = TRUE)
+    if (distribution == "Bernoulli") {
+        x <- dfm_weight(x, "boolean", force = TRUE)
+    }
 
-    temp <- dfm_group(temp, class, force = TRUE)
+    # x <- dfm_group(x, groups = y, force = TRUE)
+    x <- group_classes(x, y,
+                       smooth = if (distribution == "multinomial") smooth else 0)
 
-    freq <- rowSums(as.matrix(table(class)))
+    freq <- rowSums(as.matrix(table(y)))
     if (prior == "uniform") {
-        Pc <- rep(1 / ndoc(temp), ndoc(temp))
-        names(Pc) <- docnames(temp)
+        Pc <- rep(1 / nrow(x), nrow(x))
+        names(Pc) <- rownames(x)
     } else if (prior == "docfreq") {
         Pc <- freq
         Pc <- Pc / sum(Pc)
     } else if (prior == "termfreq") {
-        Pc <- rowSums(temp)
+        Pc <- rowSums(x)
         Pc <- Pc / sum(Pc)
     }
 
     if (distribution == "multinomial") {
-        PwGc <- dfm_weight(dfm_smooth(temp, smooth), scheme = "prop", force = TRUE)
+        # PwGc <- dfm_weight(dfm_smooth(x, smooth), scheme = "prop", force = TRUE)
+        # PwGc <- (x + smooth) / (rowSums(x) + ncol(x) * smooth)
+        PwGc <- x / rowSums(x)
     } else if (distribution == "Bernoulli") {
-        # denominator here is same as IIR Fig 13.3 line 8 - see also Eq. 13.7
-        PwGc <- (temp + smooth) / (freq + smooth * ndoc(temp))
+        PwGc <- (x + smooth) / (freq + 2 * smooth)
         PwGc <- as(PwGc, "dgeMatrix")
     }
 
-    # order Pc so that these are the same order as rows of PwGc
-    Pc <- Pc[rownames(PwGc)]
+    Pc <- Pc[rownames(PwGc)]      # make sure Pc order matches the rows of PwGc
+    PwGc <- as.matrix(PwGc)       # convert to ordinary matrix
+    # names(dimnames(PwGc)) <- NULL # remove dimname labels
 
-    ## posterior: class x words, cols sum to 1
-    PcGw <- colNorm(PwGc * base::outer(Pc, rep(1, ncol(PwGc))))
+    ## other quantities no longer computed
+    # PcGw <- colNorm(PwGc * base::outer(Pc, rep(1, ncol(PwGc))))
+    # names(dimnames(PcGw))[1] <- names(dimnames(PwGc))[1] <- "classes"
+    # Pw <- t(PwGc) %*% as.numeric(Pc)
 
-    # rename row dimensions
-    names(dimnames(PcGw))[1] <- names(dimnames(PwGc))[1] <- "classes"
-
-    ## P(w)
-    Pw <- t(PwGc) %*% as.numeric(Pc)
-
-    result <- list(
-        PwGc = as.matrix(PwGc),
-        Pc = Pc,
-        PcGw = as.matrix(PcGw),
-        Pw = as.matrix(Pw),
-        x = x, y = y,
-        distribution = distribution,
-        prior = prior,
-        smooth = smooth,
-        call = call
-    )
+    result[["priors"]] <- Pc
+    result[["param"]] <- PwGc
     class(result) <- c("textmodel_nb", "textmodel", "list")
     result
 }
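
As a hand check on the new multinomial fitting path, the add-one smoothing arithmetic for the IIR 13.1 training set used in the @examples can be reproduced directly. The class-by-feature counts below are tallied from the example texts (d1-d3 are class Y, d4 is class N); row-normalising the smoothed counts mirrors group_classes(x, y, smooth = smooth) followed by x / rowSums(x), and is equivalent to the commented-out (x + smooth) / (rowSums(x) + ncol(x) * smooth) form:

# class-by-feature counts from the IIR 13.1 example, plus add-one smoothing,
# i.e. what group_classes(x, y, smooth = 1) returns for the multinomial model
counts <- rbind(
    N = c(Chinese = 1, Beijing = 0, Shanghai = 0, Macao = 0, Tokyo = 1, Japan = 1) + 1,
    Y = c(Chinese = 5, Beijing = 1, Shanghai = 1, Macao = 1, Tokyo = 0, Japan = 0) + 1
)

# row-normalising the smoothed counts gives the class-conditional estimates
param <- counts / rowSums(counts)
param["Y", "Chinese"]   # 6/14 = 3/7, as in the IIR worked example
param["N", "Chinese"]   # 2/9

# the "docfreq" prior for this training set: 3 Y documents, 1 N document
priors <- c(N = 1, Y = 3) / 4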
@@ -209,78 +207,57 @@
                                  type = c("class", "probability", "logposterior"),
                                  force = FALSE, ...) {
     unused_dots(...)
-
     type <- match.arg(type)
 
-    if (!is.null(newdata)) {
-        data <- as.dfm(newdata)
-    } else {
-        data <- as.dfm(object$x)
-    }
-
-    # # remove any words with zero probabilities (this should be done in fitting)
-    # is_zero <- colSums(object$PwGc) == 0
-    # if (any(is_zero)) {
-    #     object$PwGc <- object$PwGc[,!is_zero,drop = FALSE]
-    #     object$PcGw <- object$PcGw[,!is_zero,drop = FALSE]
-    #     object$Pw <- object$Pw[!is_zero,,drop = FALSE]
-    # }
-    data <- force_conformance(data, colnames(object$PwGc), force)
+    newdata <- if (!is.null(newdata)) as.dfm(newdata) else as.dfm(object$x)
+    newdata <- force_conformance(newdata, colnames(object$param), force)
 
     if (object$distribution == "multinomial") {
 
         # log P(d|c) class conditional document likelihoods
-        log_lik <- data %*% t(log(object$PwGc))
-        # weight by class priors
-        logpos <- t(apply(log_lik, 1, "+", log(object$Pc)))
+        loglik <- Matrix::tcrossprod(newdata, log(object$param))
 
     } else if (object$distribution == "Bernoulli") {
 
-        data <- dfm_weight(data, "boolean")
-        Nc <- length(object$Pc)
-
-        # initialize log posteriors with class priors
-        logpos <- matrix(log(object$Pc), byrow = TRUE,
-                            ncol = Nc, nrow = nrow(data),
-                            dimnames = list(rownames(data), names(object$Pc)))
-        # APPLYBERNOULLINB from IIR Fig 13.3
-        for (c in seq_len(Nc)) {
-            tmp1 <- log(t(data) * object$PwGc[c, ])
-            tmp1[is.infinite(tmp1)] <- 0
-            tmp0 <- log(t(!data) * (1 - object$PwGc[c, ]))
-            tmp0[is.infinite(tmp0)] <- 0
-            logpos[, c] <- logpos[, c] + colSums(tmp0) + colSums(tmp1)
-        }
-    }
+        newdata <- dfm_weight(newdata, "boolean", force = TRUE)
 
-    # predict MAP class
-    nb.predicted <- colnames(logpos)[apply(logpos, 1, which.max)]
+        present <- log(object$param)
+        nonpresent <- log(1 - object$param)
+        threshold <- .Machine$double.eps
+        present[is.infinite(present)] <- max(-100000, log(threshold))
+        nonpresent[is.infinite(nonpresent)] <- max(-100000, log(threshold))
 
+        presence_prob <- Matrix::tcrossprod(newdata, present)
+        nonpresence_prob <- matrix(base::colSums(t(nonpresent)),
+                                   nrow = nrow(presence_prob),
+                                   ncol = ncol(presence_prob), byrow = TRUE) -
+            Matrix::tcrossprod(newdata, nonpresent)
+        loglik <- presence_prob + nonpresence_prob
+
+    }
+
+    # weight by class priors
+    logpos <- t(t(loglik) + log(object$priors))
 
     if (type == "class") {
-        names(nb.predicted) <- docnames(data)
-        return(factor(nb.predicted, levels = names(object$Pc)))
+
+        classpred <- colnames(logpos)[max.col(logpos, ties.method = "first")]
+        names(classpred) <- rownames(newdata)
+        result <- factor(classpred, levels = names(object$priors))
+
     } else if (type == "probability") {
 
-        ## compute class posterior probabilities
-        post_prob <- matrix(NA, ncol = ncol(logpos), nrow = nrow(logpos),
+        result <- matrix(NA, ncol = ncol(logpos), nrow = nrow(logpos),
                             dimnames = dimnames(logpos))
-
-        # compute posterior probabilities
         for (j in seq_len(ncol(logpos))) {
             base_lpl <- logpos[, j]
-            post_prob[, j] <- 1 / (1 + rowSums(exp(logpos[, -j, drop = FALSE] - base_lpl)))
+            result[, j] <- 1 / (1 + rowSums(exp(logpos[, -j, drop = FALSE] - base_lpl)))
         }
 
-        # result <- list(probability = post_prob)
-        result <- post_prob
-
     } else if (type == "logposterior") {
-
-        # result <- list(logposterior = logpos)
-        result <- logpos
+        result <- as.matrix(logpos)
     }
-    # class(result) <- c("predict.textmodel_nb", "list")
+
     result
 }
 
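
In the probability branch above, each class column j is converted with 1 / (1 + sum over the other classes of exp(logpos[, k] - logpos[, j])), which is algebraically the softmax exp(logpos[, j]) / sum_k exp(logpos[, k]) but only ever exponentiates differences of log posteriors. A small standalone sketch of that identity, with invented numbers:

# toy unnormalised log posteriors: two documents x three classes (values made up)
logpos <- rbind(doc1 = c(A = -12.3, B = -15.1, C = -14.0),
                doc2 = c(A = -40.2, B = -38.9, C = -44.5))

# direct softmax: fine here, but exp() can underflow to zero for every class
# when the log posteriors are strongly negative
direct <- exp(logpos) / rowSums(exp(logpos))

# the per-column form used in predict.textmodel_nb()
stable <- logpos
for (j in seq_len(ncol(logpos))) {
    stable[, j] <- 1 / (1 + rowSums(exp(logpos[, -j, drop = FALSE] - logpos[, j])))
}

all.equal(direct, stable)   # TRUE: same probabilities, each row sums to one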
@@ -290,13 +267,12 @@
     cat("\nCall:\n")
     print(x$call)
     cat("\n",
-        "Distribution: ", x$distribution, "; ",
-        "prior: ", x$prior, "; ",
-        "smoothing value: ", x$smooth, "; ",
-        length(na.omit(x$y)), " training documents; ",
-        nfeat(na.omit(x)), " fitted features.",
-        "\n",
-        sep = "")
+        "Distribution:", x$distribution, ";",
+        "priors:", x$prior, ";",
+        "smoothing value:", x$smooth, ";",
+        length(na.omit(x$y)), "training documents; ",
+        ncol(na.omit(x)), "fitted features.",
+        "\n", sep = " ")
 }
 
 #' summary method for textmodel_nb objects
@@ -308,9 +284,9 @@
 #' @export
 summary.textmodel_nb <- function(object, n = 30, ...) {
     result <- list(
-        'call' = object$call,
-        'class.priors' = as.coefficients_textmodel(object$Pc),
-        'estimated.feature.scores' = as.coefficients_textmodel(head(coef(object), n))
+        "call" = object$call,
+        "class.priors" = as.coefficients_textmodel(object$priors),
+        "estimated.feature.scores" = as.coefficients_textmodel(head(coef(object), n))
     )
     as.summary.textmodel(result)
 }
@@ -319,7 +295,7 @@
 #' @method coef textmodel_nb
 #' @export
 coef.textmodel_nb <- function(object, ...) {
-    t(object$PcGw)
+    t(object$param)
 }
 
 #' @noRd
@@ -328,14 +304,3 @@
 coefficients.textmodel_nb <- function(object, ...) {
     UseMethod("coef")
 }
-
-## make cols add up to one
-colNorm <- function(x) {
-    x / outer(rep(1, nrow(x)), colSums(x))
-}
-
-#' @export
-#' @method print predict.textmodel_nb
-print.predict.textmodel_nb <- function(x, ...) {
-    print(unclass(x))
-}
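
With the slimmer fitted object, the quantities documented in the revised @return section are reached through the priors and param elements, and coef() now returns the transposed param matrix. A brief usage sketch, reusing the tmod1 object from the @examples; the priors shown are what the docfreq prior implies for that training set (3 Y documents, 1 N document):

# assuming tmod1 <- textmodel_nb(x, y, prior = "docfreq") as in the examples
tmod1$priors    # named class prior probabilities, here N = 0.25, Y = 0.75
tmod1$param     # k x V matrix of class-conditional feature estimates
tmod1$x         # the full training dfm, including the NA-class document
coef(tmod1)     # the same estimates as param, transposed to features x classes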
Files                       Coverage
R                           85.31%
src                         94.09%
Project Totals (16 files)   87.54%