juliasilge / tidytext
1
#' Various lexicons for English stop words
2
#'
3
#' English stop words from three lexicons, as a data frame.
4
#' The snowball and SMART sets are pulled from the tm package. Note
5
#' that words with non-ASCII characters have been removed.
6
#'
7
#' @format A data frame with 1149 rows and 2 variables:
8
#' \describe{
9
#'  \item{word}{An English word}
10
#'  \item{lexicon}{The source of the stop word. Either "onix", "SMART", or "snowball"}
11
#'  }
12
#'
13
#' @source \itemize{
14
#' \item \url{http://www.lextek.com/manuals/onix/stopwords1.html}
15
#' \item \url{https://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf}
16
#' \item \url{http://snowball.tartarus.org/algorithms/english/stop.txt}
17
#' }
18
"stop_words"
19

20
#' Get a tidy data frame of a single stopword lexicon
21
#'
22
#' Get a specific stop word lexicon via the \pkg{stopwords} package's
23
#' \link[stopwords]{stopwords} function, in a tidy format with one word per row.
24
#'
25
#' @param language The language of the stopword lexicon specified as a
26
#' two-letter ISO code, such as \code{"es"}, \code{"de"}, or \code{"fr"}.
27
#' Default is \code{"en"} for English. Use
28
#' \link[stopwords]{stopwords_getlanguages} from \pkg{stopwords} to see available
29
#' languages.
30
#' @param source The source of the stopword lexicon specified. Default is
31
#' \code{"snowball"}. Use \link[stopwords]{stopwords_getsources} from
32
#' \pkg{stopwords} to see available sources.
33
#'
34
#' @return A tibble with two columns, \code{word} and \code{lexicon}. The
35
#' \code{lexicon} column holds the value passed as \code{source}.
36
#'
37
#' @examples
38
#'
39
#' library(dplyr)
40
#' get_stopwords()
41
#' get_stopwords(source = "smart")
42
#' get_stopwords("es", "snowball")
43
#' get_stopwords("ru", "snowball")
44
#'
45
#' @export
46
#'
47
get_stopwords <- function(language = "en", source = "snowball") {
  # The stopwords package is only needed here, so verify it is available
  # before calling into it.
  rlang::check_installed("stopwords", "to use this function.")

  # Character vector of stop words for the requested language and source.
  words <- stopwords::stopwords(language = language, source = source)

  # One row per word; `lexicon` repeats the requested source on every row.
  tibble(word = words, lexicon = source)
}

Read our documentation on viewing source code .

Loading