#' Naive Bayes classifier for texts
#'
#' Fit a multinomial or Bernoulli Naive Bayes model, given a dfm and some
#' training labels.
#' @param x the [dfm] on which the model will be fit.  Does not need to
#'   contain only the training documents.
#' @param y vector of training labels associated with each document in `x`;
#'   documents with an `NA` label are not used for training.  (These will be
#'   converted to factors if not already factors.)
#' @param smooth smoothing parameter for feature counts, added to the
#'   feature frequency totals by training class
#' @param prior prior distribution on texts; one of `"uniform"`,
#'   `"docfreq"`, or `"termfreq"`.  See Prior Distributions below.
#' @param distribution count model for text features, can be `"multinomial"` or
#'   `"Bernoulli"`.  To fit a "binary multinomial" model, first convert the dfm
#'   to a binary matrix using `quanteda::dfm_weight(x, scheme = "boolean")`.
#' @return
#' `textmodel_nb()` returns a list consisting of the following (where
#' \eqn{N} is the total number of documents, \eqn{V} is the total number of
#' features, and \eqn{k} is the total number of training classes):
#' @return \item{call}{original function call}
#' @return \item{param}{\eqn{k \times V}; class conditional posterior estimates}
#' @return \item{x}{the \eqn{N \times V} training dfm `x`}
#' @return \item{y}{the \eqn{N}-length `y` training class vector, where NAs will
#'   not be used for training but will be retained in the saved `x` matrix}
#' @return \item{distribution}{character; the distribution of `x` for the NB
#'   model}
#' @return \item{priors}{numeric; the class prior probabilities}
#' @return \item{smooth}{numeric; the value of the smoothing parameter}
#' @section Prior distributions:
#'
#'   Prior distributions refer to the prior probabilities assigned to the
#'   training classes, and the choice of prior distribution affects the
#'   calculation of the fitted probabilities.  The default is uniform priors,
#'   which sets the unconditional probability of observing any one class to be
#'   the same as observing any other class.
#'
#'   "Document frequency" means that the class priors will be taken from the
#'   relative proportions of the class documents used in the training set.  This
#'   approach is so common that it is assumed in many examples, such as the
#'   worked example from Manning, Raghavan, and Schütze (2008) below.  It is not
#'   the default in \pkg{quanteda}, however, since there may be nothing
#'   informative in the relative numbers of documents used to train a classifier
#'   other than the relative availability of the documents.  When training
#'   classes are balanced in their number of documents (usually advisable),
#'   however, the empirically computed "docfreq" priors would be equivalent to
#'   "uniform" priors.
#'
#'   Setting `prior` to "termfreq" makes the priors equal to the proportions of
#'   total feature counts found in the grouped documents in each training class,
#'   so that the classes with the largest total feature counts are assigned the
#'   largest priors.  If the total count of features in each training class were
#'   the same, then "uniform" and "termfreq" would be equivalent.
#' @section Smoothing parameter:
#'
#'   The `smooth` value is added to the feature frequencies, aggregated by
#'   training class, to avoid zero frequencies in any class.  This has the
#'   effect of giving more weight to infrequent term occurrences.
#' @references Manning, C.D., Raghavan, P., & Schütze, H. (2008). *An
#'   Introduction to Information Retrieval*. Cambridge: Cambridge University
#'   Press (Chapter 13). Available at
#'   <https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf>.
#'
#'   Jurafsky, D. & Martin, J.H. (2018). From *Speech and Language Processing:
#'   An Introduction to Natural Language Processing, Computational Linguistics,
#'   and Speech Recognition*. Draft of September 23, 2018 (Chapter 6, Naive
#'   Bayes). Available at <https://web.stanford.edu/~jurafsky/slp3/>.
#'
#' @seealso [predict.textmodel_nb()]
#' @author Kenneth Benoit
#' @importFrom quanteda dfm_weight as.dfm
#' @examples
#' ## Example from 13.1 of _An Introduction to Information Retrieval_
#' txt <- c(d1 = "Chinese Beijing Chinese",
#'          d2 = "Chinese Chinese Shanghai",
#'          d3 = "Chinese Macao",
#'          d4 = "Tokyo Japan Chinese",
#'          d5 = "Chinese Chinese Chinese Tokyo Japan")
#' x <- quanteda::dfm(txt, tolower = FALSE)
#' y <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
#'
#' ## replicate IIR p261 prediction for test set (document 5)
#' (tmod1 <- textmodel_nb(x, y, prior = "docfreq"))
#' summary(tmod1)
#' coef(tmod1)
#' predict(tmod1, type = "prob")
#' predict(tmod1)
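#'
#' ## hand check of the smoothed estimates reported by coef(tmod1): with
#' ## smooth = 1, P("Chinese" | Y) = (5 + 1) / (8 + 6 * 1) = 3/7, and the
#' ## "docfreq" prior for class Y is 3/4, as in IIR p261
#' coef(tmod1)["Chinese", "Y"]
#' (5 + 1) / (8 + 6 * 1)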
#'
#' # contrast with other priors
#' predict(textmodel_nb(x, y, prior = "uniform"))
#' predict(textmodel_nb(x, y, prior = "termfreq"))
#'
#' ## replicate IIR p264 Bernoulli Naive Bayes
#' tmod2 <- textmodel_nb(x, y, distribution = "Bernoulli", prior = "docfreq")
#' predict(tmod2, newdata = x[5, ], type = "prob")
#' predict(tmod2, newdata = x[5, ])
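#'
#' ## hand check of the Bernoulli estimates: with smooth = 1,
#' ## P("Chinese" | Y) = (3 + 1) / (3 + 2) = 4/5 and
#' ## P("Japan" | Y) = (0 + 1) / (3 + 2) = 1/5, as in IIR p264
#' coef(tmod2)[c("Chinese", "Japan"), "Y"]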
#' @export
textmodel_nb <- function(x, y, smooth = 1,
                         prior = c("uniform", "docfreq", "termfreq"),
                         distribution = c("multinomial", "Bernoulli")) {
    UseMethod("textmodel_nb")
}

#' @export
textmodel_nb.default <- function(x, y, smooth = 1,
                                 prior = c("uniform", "docfreq", "termfreq"),
                                 distribution = c("multinomial", "Bernoulli")) {
    stop(friendly_class_undefined_message(class(x), "textmodel_nb"))
}

#' @export
#' @importFrom quanteda colSums rowSums dfm_weight
textmodel_nb.dfm <- function(x, y, smooth = 1,
                             prior = c("uniform", "docfreq", "termfreq"),
                             distribution = c("multinomial", "Bernoulli")) {
    x <- as.dfm(x)
    y <- as.factor(y)
    if (!sum(x)) stop(message_error("dfm_empty"))

    prior <- match.arg(prior)
    distribution <- match.arg(distribution)

    if (stats::var(as.numeric(y), na.rm = TRUE) == 0)
        stop("y cannot be constant")

    result <- list(
        call = match.call(),
        x = x, y = y,
        distribution = distribution,
        smooth = smooth
    )

    if (anyNA(y)) {
        x <- x[!is.na(y), ]
        y <- y[!is.na(y)]
    }

    if (distribution == "Bernoulli") {
        x <- dfm_weight(x, "boolean", force = TRUE)
    }

    # x <- dfm_group(x, groups = y, force = TRUE)
    x <- group_classes(x, y,
                       smooth = if (distribution == "multinomial") smooth else 0)
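    # x now holds one row of aggregated feature counts per training class
    # (with the multinomial counts also smoothed inside group_classes())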

    freq <- rowSums(as.matrix(table(y)))
    if (prior == "uniform") {
        Pc <- rep(1 / nrow(x), nrow(x))
        names(Pc) <- rownames(x)
    } else if (prior == "docfreq") {
        Pc <- freq
        Pc <- Pc / sum(Pc)
    } else if (prior == "termfreq") {
        Pc <- rowSums(x)
        Pc <- Pc / sum(Pc)
    }

    if (distribution == "multinomial") {
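        # class counts were already smoothed in group_classes() above, so
        # normalizing each row gives (count + smooth) / (total + ncol(x) * smooth),
        # as in the commented-out alternatives below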
        # PwGc <- dfm_weight(dfm_smooth(x, smooth), scheme = "prop", force = TRUE)
        # PwGc <- (x + smooth) / (rowSums(x) + ncol(x) * smooth)
        PwGc <- x / rowSums(x)
    } else if (distribution == "Bernoulli") {
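        # smoothed document-frequency estimate of P(w|c):
        # (class-c documents containing w + smooth) / (class-c documents + 2 * smooth)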
        PwGc <- (x + smooth) / (freq + 2 * smooth)
        PwGc <- as(PwGc, "dgeMatrix")
    }

    Pc <- Pc[rownames(PwGc)]      # make sure Pc order matches the rows of PwGc
    PwGc <- as.matrix(PwGc)       # convert to ordinary matrix
    # names(dimnames(PwGc)) <- NULL # remove dimname labels

    ## other quantities no longer computed
    # PcGw <- colNorm(PwGc * base::outer(Pc, rep(1, ncol(PwGc))))
    # names(dimnames(PcGw))[1] <- names(dimnames(PwGc))[1] <- "classes"
    # Pw <- t(PwGc) %*% as.numeric(Pc)

    result[["priors"]] <- Pc
    result[["param"]] <- PwGc
    class(result) <- c("textmodel_nb", "textmodel", "list")
    result
}

# helper methods ----------------

#' Prediction from a fitted textmodel_nb object
#'
#' `predict.textmodel_nb()` implements class predictions from a fitted
#' Naive Bayes textmodel.
#' @param object a fitted Naive Bayes textmodel
#' @param newdata dfm on which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_nb` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`) or log posterior likelihoods (when `type =
#'   "logposterior"`).
#' @seealso [textmodel_nb()]
#' @examples
#' # application to LBG (2003) example data
#' (tmod <- textmodel_nb(data_dfm_lbgexample, y = c("A", "A", "B", "C", "C", NA)))
#' predict(tmod)
#' predict(tmod, type = "logposterior")
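#' # document-by-class matrix of predicted class probabilities
#' predict(tmod, type = "probability")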
#' @importFrom stats predict
#' @method predict textmodel_nb
#' @keywords textmodel internal
#' @export
predict.textmodel_nb <- function(object, newdata = NULL,
                                 type = c("class", "probability", "logposterior"),
                                 force = FALSE, ...) {
    unused_dots(...)
    type <- match.arg(type)
    if ("Pc" %in% names(object)) {
        names(object)[which(names(object) == "Pc")] <- "priors"
    }
    if ("PwGc" %in% names(object)) {
        names(object)[which(names(object) == "PwGc")] <- "param"
    }
    newdata <- if (!is.null(newdata)) as.dfm(newdata) else as.dfm(object$x)
    newdata <- force_conformance(newdata, colnames(object$param), force)

    if (object$distribution == "multinomial") {

        # log P(d|c) class conditional document likelihoods
        loglik <- Matrix::tcrossprod(newdata, log(object$param))

    } else if (object$distribution == "Bernoulli") {

        newdata <- dfm_weight(newdata, "boolean", force = TRUE)

        present <- log(object$param)
        nonpresent <- log(1 - object$param)
        threshold <- .Machine$double.eps
        present[is.infinite(present)] <- max(-100000, log(threshold))
        nonpresent[is.infinite(nonpresent)] <- max(-100000, log(threshold))
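
        # log P(d|c) sums log P(w|c) over the features present in the document
        # and log(1 - P(w|c)) over the absent features; the absent-feature term
        # is computed as each class's total sum of log(1 - P(w|c)) minus the
        # part belonging to the features that are present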
        presence_prob <- Matrix::tcrossprod(newdata, present)
        nonpresence_prob <- matrix(base::colSums(t(nonpresent)),
                                   nrow = nrow(presence_prob),
                                   ncol = ncol(presence_prob), byrow = TRUE) -
            Matrix::tcrossprod(newdata, nonpresent)
        loglik <- presence_prob + nonpresence_prob

    }

    # weight by class priors
    logpos <- t(t(loglik) + log(object$priors))

    if (type == "class") {

        classpred <- colnames(logpos)[max.col(logpos, ties.method = "first")]
        names(classpred) <- rownames(newdata)
        result <- factor(classpred, levels = names(object$priors))

    } else if (type == "probability") {
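
        # convert log posteriors to class probabilities:
        # P(c_j | d) = 1 / (1 + sum_{k != j} exp(logpos_k - logpos_j)),
        # a softmax that avoids exponentiating the raw log posteriors directly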
        result <- matrix(NA, ncol = ncol(logpos), nrow = nrow(logpos),
                         dimnames = dimnames(logpos))
        for (j in seq_len(ncol(logpos))) {
            base_lpl <- logpos[, j]
            result[, j] <- 1 / (1 + rowSums(exp(logpos[, -j, drop = FALSE] - base_lpl)))
        }

    } else if (type == "logposterior") {
        result <- as.matrix(logpos)
    }

    result
}

#' @export
#' @method print textmodel_nb
print.textmodel_nb <- function(x, ...) {
    cat("\nCall:\n")
    print(x$call)
    cat("\n",
        "Distribution:", x$distribution, ";",
        "priors:", x$priors, ";",
        "smoothing value:", x$smooth, ";",
        length(na.omit(x$y)), "training documents; ",
        ncol(x$x), "fitted features.",
        "\n", sep = " ")
}

#' summary method for textmodel_nb objects
#' @param object output from [textmodel_nb()]
#' @param n how many coefficients to print before truncating
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_nb
#' @export
summary.textmodel_nb <- function(object, n = 30, ...) {
    result <- list(
        "call" = object$call,
        "class.priors" = as.coefficients_textmodel(object$priors),
        "estimated.feature.scores" = as.coefficients_textmodel(head(coef(object), n))
    )
    as.summary.textmodel(result)
}

#' @rdname predict.textmodel_nb
#' @method coef textmodel_nb
#' @return `coef.textmodel_nb()` returns a matrix of estimated
#'   word likelihoods given the class.  (In earlier versions,
#'   this was named `PwGc`.)
#' @export
coef.textmodel_nb <- function(object, ...) {
    t(object$param)
}

#' @rdname predict.textmodel_nb
#' @method coefficients textmodel_nb
#' @export
coefficients.textmodel_nb <- function(object, ...) {
    UseMethod("coef")
}
