quanteda / quanteda.textmodels
Showing 8 of 106 files from the diff.
Other files ignored by Codecov
NAMESPACE has changed.
man/textmodels.Rd has changed.
.Rbuildignore has changed.
tests/testthat.R has changed.
README.md has changed.
NEWS.md has changed.
cran-comments.md has changed.
inst/WORDLIST has changed.
DESCRIPTION has changed.

@@ -30,7 +30,8 @@
Loading
30 30
#' @return `textmodel_ca()` returns a fitted CA textmodel that is a special
31 31
#' class of \pkg{ca} object.
32 32
#' @examples
33 -
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
33 +
#' library("quanteda")
34 +
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
34 35
#' tmod <- textmodel_ca(dfmat)
35 36
#' summary(tmod)
36 37
#' @seealso [coef.textmodel_lsa()], [ca][ca::ca]

@@ -1,4 +1,4 @@
Loading
1 -
#' (faster) Linear SVM classifier for texts
1 +
#' \[experimental\] Linear SVM classifier for texts
2 2
#'
3 3
#' Fit a fast linear SVM classifier for sparse text matrices, using svmlin C++
4 4
#' code written by Vikas Sindhwani and S. Sathiya Keerthi.  This method
@@ -6,6 +6,11 @@
Loading
6 6
#' described in Sindhwani and Keerthi (2006). Currently,
7 7
#' `textmodel_svmlin()` only works for two-class problems.
8 8
#'
9 +
#' @section Warning:
10 +
#' This function is marked experimental since it's not fully working yet in a
11 +
#' way that translates into more standard SVM parameters that we understand. Use
12 +
#' with caution after reading the Sindhwani and Keerthi (2006) paper.
13 +
#'
9 14
#' @param x the [dfm] on which the model will be fit.  Does not need to contain
10 15
#'   only the training documents.
11 16
#' @param y vector of training labels associated with each document identified
@@ -31,15 +36,15 @@
Loading
31 36
#' @seealso [predict.textmodel_svmlin()]
32 37
#' @examples
33 38
#' # use Lenihan for govt class and Bruton for opposition
34 -
#' quanteda::docvars(data_corpus_irishbudget2010, "govtopp") <-
35 -
#'     c("Govt", "Opp", rep(NA, 12))
36 -
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
39 +
#' library("quanteda")
40 +
#' docvars(data_corpus_irishbudget2010, "govtopp") <- c("Govt", "Opp", rep(NA, 12))
41 +
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
37 42
#'
38 -
#' tmod <- textmodel_svmlin(dfmat, y = quanteda::docvars(dfmat, "govtopp"))
43 +
#' tmod <- textmodel_svmlin(dfmat, y = dfmat$govtopp)
39 44
#' predict(tmod)
40 45
#' @importFrom quanteda dfm_group as.dfm
41 46
#' @importFrom stats na.omit predict
42 -
#' @keywords textmodel
47 +
#' @keywords textmodel internal experimental
43 48
#' @export
44 49
textmodel_svmlin <- function(x, y, intercept = TRUE, # x_u = NULL,
45 50
                             lambda = 1,

@@ -15,28 +15,27 @@
Loading
15 15
#' @seealso [`cv.glmnet()`][glmnet::cv.glmnet()], [predict.textmodel_lr()],
16 16
#'   [coef.textmodel_lr()]
17 17
#' @references
18 -
#' Friedman, J., Hastie, T., & Tibshirani, R. (2010). [Regularization Paths for
19 -
#' Generalized Linear Models via Coordinate
20 -
#' Descent](http://dx.doi.org/10.18637/jss.v033.i01). _Journal of Statistical
21 -
#' Software_ 33(1), 1-22.
18 +
#' Friedman, J., Hastie, T., & Tibshirani, R. (2010). Regularization Paths for
19 +
#' Generalized Linear Models via Coordinate Descent. _Journal of Statistical
20 +
#' Software_ 33(1), 1-22.  \doi{10.18637/jss.v033.i01}
22 21
#' @examples
23 22
#' ## Example from 13.1 of _An Introduction to Information Retrieval_
24 -
#' corp <- quanteda::corpus(c(d1 = "Chinese Beijing Chinese",
25 -
#'                            d2 = "Chinese Chinese Shanghai",
26 -
#'                            d3 = "Chinese Macao",
27 -
#'                            d4 = "Tokyo Japan Chinese",
28 -
#'                            d5 = "London England Chinese",
29 -
#'                            d6 = "Chinese Chinese Chinese Tokyo Japan"),
30 -
#'                          docvars = data.frame(train = factor(c("Y", "Y", "Y",
31 -
#'                                                                "N", "N", NA))))
32 -
#' dfmat <- quanteda::dfm(corp, tolower = FALSE)
23 +
#' library("quanteda")
24 +
#' corp <- corpus(c(d1 = "Chinese Beijing Chinese",
25 +
#'                  d2 = "Chinese Chinese Shanghai",
26 +
#'                  d3 = "Chinese Macao",
27 +
#'                  d4 = "Tokyo Japan Chinese",
28 +
#'                  d5 = "London England Chinese",
29 +
#'                  d6 = "Chinese Chinese Chinese Tokyo Japan"),
30 +
#'                docvars = data.frame(train = factor(c("Y", "Y", "Y", "N", "N", NA))))
31 +
#' dfmat <- dfm(tokens(corp), tolower = FALSE)
33 32
#'
34 33
#' ## simulate bigger sample as classification on small samples is problematic
35 34
#' set.seed(1)
36 -
#' dfmat <- quanteda::dfm_sample(dfmat, 50, replace = TRUE)
35 +
#' dfmat <- dfm_sample(dfmat, 50, replace = TRUE)
37 36
#'
38 37
#' ## train model
39 -
#' (tmod1 <- textmodel_lr(dfmat, quanteda::docvars(dfmat, "train")))
38 +
#' (tmod1 <- textmodel_lr(dfmat, docvars(dfmat, "train")))
40 39
#' summary(tmod1)
41 40
#' coef(tmod1)
42 41
#'

@@ -79,7 +79,7 @@
Loading
79 79
#'
80 80
#' \dontrun{
81 81
#' library("quanteda")
82 -
#' dfmat <- dfm(data_corpus_irishbudget2010)
82 +
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
83 83
#' (tmod2 <- textmodel_wordfish(dfmat, dir = c(6, 5)))
84 84
#' (tmod3 <- textmodel_wordfish(dfmat, dir = c(6, 5),
85 85
#'                              dispersion = "quasipoisson", dispersion_floor = 0))

@@ -12,6 +12,9 @@
Loading
12 12
#'   uses default; `"docfreq"` weights by the number of training examples,
13 13
#'   and `"termfreq"` by the relative sizes of the training classes in
14 14
#'   terms of their total lengths in tokens.
15 +
#' @param type argument passed to the `type` argument in
16 +
#'   [LiblineaR::LiblineaR()]; default is `1` for L2-regularized L2-loss support
17 +
#'   vector classification (dual)
15 18
#' @param ... additional arguments passed to [LiblineaR::LiblineaR()]
16 19
#' @references
17 20
#' R. E. Fan, K. W. Chang, C. J. Hsieh, X. R. Wang, and C. J. Lin. (2008)
@@ -21,25 +24,24 @@
Loading
21 24
#' @seealso [LiblineaR::LiblineaR()] [predict.textmodel_svm()]
22 25
#' @examples
23 26
#' # use party leaders for govt and opposition classes
24 -
#' quanteda::docvars(data_corpus_irishbudget2010, "govtopp") <-
27 +
#' library("quanteda")
28 +
#' docvars(data_corpus_irishbudget2010, "govtopp") <-
25 29
#'     c(rep(NA, 4), "Gov", "Opp", NA, "Opp", NA, NA, NA, NA, NA, NA)
26 -
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
27 -
#' tmod <- textmodel_svm(dfmat, y = quanteda::docvars(dfmat, "govtopp"))
30 +
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
31 +
#' tmod <- textmodel_svm(dfmat, y = dfmat$govtopp)
28 32
#' predict(tmod)
29 -
#' predict(tmod, type = "probability")
30 33
#'
31 34
#' # multiclass problem - all party leaders
32 35
#' tmod2 <- textmodel_svm(dfmat,
33 36
#'     y = c(rep(NA, 3), "SF", "FF", "FG", NA, "LAB", NA, NA, "Green", rep(NA, 3)))
34 37
#' predict(tmod2)
35 -
#' predict(tmod2, type = "probability")
36 38
#' @export
37 -
textmodel_svm <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), ...) {
39 +
textmodel_svm <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), type = 1, ...) {
38 40
    UseMethod("textmodel_svm")
39 41
}
40 42
41 43
#' @export
42 -
textmodel_svm.default <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), ...) {
44 +
textmodel_svm.default <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), type = 1, ...) {
43 45
    stop(friendly_class_undefined_message(class(x), "textmodel_svm"))
44 46
}
45 47
@@ -47,7 +49,7 @@
Loading
47 49
#' @importFrom SparseM as.matrix.csr
48 50
#' @importFrom quanteda dfm_weight dfm_group dfm_trim as.dfm
49 51
#' @export
50 -
textmodel_svm.dfm <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), ...) {
52 +
textmodel_svm.dfm <- function(x, y, weight = c("uniform", "docfreq", "termfreq"), type = 1, ...) {
51 53
    x <- as.dfm(x)
52 54
    if (!sum(x)) stop(message_error("dfm_empty"))
53 55
    call <- match.call()
@@ -73,7 +75,7 @@
Loading
73 75
    }
74 76
75 77
    svmlinfitted <- LiblineaR::LiblineaR(as.matrix.csr.dfm(x_train),
76 -
                                         target = y_train, wi = wi, ...)
78 +
                                         target = y_train, wi = wi, type = type, ...)
77 79
    colnames(svmlinfitted$W)[seq_along(featnames(x_train))] <- featnames(x_train)
78 80
    result <- list(
79 81
        x = x, y = y,
@@ -132,14 +134,14 @@
Loading
132 134
    else
133 135
        force_conformance(data, model_featnames, force)
134 136
135 -
    pred_y <- predict(object$svmlinfitted,
136 -
                      newx = as.matrix.csr.dfm(data),
137 -
                      proba = (type == "probability"))
138 -
139 137
    if (type == "class") {
138 +
        pred_y <- predict(object$svmlinfitted, newx = as.matrix.csr.dfm(data), proba = FALSE)
140 139
        pred_y <- pred_y$predictions
141 140
        names(pred_y) <- docnames(data)
142 141
    } else if (type == "probability") {
142 +
        if (object$type != 0)
143 +
            stop("probability predictions not implemented for this model type")
144 +
        pred_y <- predict(object$svmlinfitted, newx = as.matrix.csr.dfm(data), proba = TRUE)
143 145
        pred_y <- pred_y$probabilities
144 146
        rownames(pred_y) <- docnames(data)
145 147
    }

@@ -71,12 +71,13 @@
Loading
71 71
#' @importFrom quanteda dfm_weight as.dfm
72 72
#' @examples
73 73
#' ## Example from 13.1 of _An Introduction to Information Retrieval_
74 +
#' library("quanteda")
74 75
#' txt <- c(d1 = "Chinese Beijing Chinese",
75 76
#'          d2 = "Chinese Chinese Shanghai",
76 77
#'          d3 = "Chinese Macao",
77 78
#'          d4 = "Tokyo Japan Chinese",
78 79
#'          d5 = "Chinese Chinese Chinese Tokyo Japan")
79 -
#' x <- quanteda::dfm(txt, tolower = FALSE)
80 +
#' x <- dfm(tokens(txt), tolower = FALSE)
80 81
#' y <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
81 82
#'
82 83
#' ## replicate IIR p261 prediction for test set (document 5)

@@ -1,8 +1,8 @@
Loading
1 1
#' Influence plot for text scaling models
2 -
#' 
3 -
#' Plot the results of a fitted scaling model, from (e.g.) a predicted 
2 +
#'
3 +
#' Plot the results of a fitted scaling model, from (e.g.) a predicted
4 4
#' [textmodel_affinity] model.
5 -
#' @param x the object output from `influence()` run on the 
5 +
#' @param x the object output from `influence()` run on the
6 6
#'   fitted or predicted scaling model object to be plotted
7 7
#' @param n the number of features whose influence will be plotted
8 8
#' @param ... additional arguments passed to [plot()]
@@ -11,10 +11,10 @@
Loading
11 11
#' @export
12 12
#' @author Patrick Perry and Kenneth Benoit
13 13
#' @seealso [influence.predict.textmodel_affinity()]
14 -
#' @keywords textplot
14 +
#' @keywords textplot internal
15 15
#' @examples
16 16
#' tmod <- textmodel_affinity(data_dfm_lbgexample, y = c("L", NA, NA, NA, "R", NA))
17 -
#' pred <- predict(tmod) 
17 +
#' pred <- predict(tmod)
18 18
#' textplot_influence(influence(pred))
19 19
textplot_influence <- function(x, n = 30, ...) {
20 20
    UseMethod("textplot_influence")
@@ -41,13 +41,13 @@
Loading
41 41
    influence <- x$median[x$support]
42 42
    direction <- x$direction[x$support]
43 43
    imbalance <- influence / rate
44 -
    
44 +
45 45
    x <- log10(rate)
46 46
    y <- 100 * influence
47 47
    col <- as.integer(direction)
48 48
    plot(x, y, type = "n", xlab=expression(Log[10]("Median Rate")),
49 49
         ylab=expression("Median Influence" %*% 100))
50 -
    
50 +
51 51
    if (!is.null(n) && !is.na(n)) {
52 52
        n <- min(n, nrow(x))
53 53
        subset <- rank(-influence, ties.method="first")  <= n
@@ -56,7 +56,7 @@
Loading
56 56
    }
57 57
    points(x[!subset], y[!subset], cex=0.5, col=col[!subset])
58 58
    text(x[subset], y[subset], word[subset], cex=0.75, col=col[subset])
59 -
    
59 +
60 60
    levels <- levels(direction)
61 61
    legend("topleft", legend = levels, fill = seq_along(levels), inset=0.05)
62 62
}

@@ -22,7 +22,8 @@
Loading
22 22
#'   by Latent Semantic Analysis](https://search.proquest.com/docview/1301252034). *Journal of the American Society for
23 23
#'   Information Science*, 41(6): 391.
24 24
#' @examples
25 -
#' dfmat <- quanteda::dfm(data_corpus_irishbudget2010)
25 +
#' library("quanteda")
26 +
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
26 27
#' # create an LSA space and return its truncated representation in the low-rank space
27 28
#' tmod <- textmodel_lsa(dfmat[1:10, ])
28 29
#' head(tmod$docs)
Files Coverage
R 82.33%
src 40.94%
Project Totals (19 files) 58.72%

No YAML configuration found.

Create your codecov.yml to customize your Codecov experience

Sunburst
The innermost circle represents the entire project; moving away from the center are folders and, finally, individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project, with folders below it and individual files at the bottom. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading