#' Split a column into tokens
#'
#' Split a column into tokens, flattening the table into one-token-per-row.
#' This function supports non-standard evaluation through the tidyeval framework.
#'
#' @param tbl A data frame
#'
#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
#' options are "words" (default), "characters", "character_shingles", "ngrams",
#' "skip_ngrams", "sentences", "lines", "paragraphs", "regex", "tweets"
#' (tokenization by word that preserves usernames, hashtags, and URLs), and
#' "ptb" (Penn Treebank). If a function, it should take a character vector and
#' return a list of character vectors of the same length.
#'
#' @param format Either "text", "man", "latex", "html", or "xml". When the
#' format is "text", this function uses the tokenizers package. If not "text",
#' it uses the hunspell tokenizer, which can tokenize only by "words".
#'
#' @param to_lower Whether to convert tokens to lowercase. If tokens include
#' URLs (such as with \code{token = "tweets"}), such converted URLs may no
#' longer be correct.
#'
#' @param drop Whether the original input column should be dropped. Ignored
#' if the original input and new output column have the same name.
#'
#' @param output Output column to be created, as a string or symbol.
#'
#' @param input Input column that gets split, as a string or symbol.
#'
#'   The output/input arguments are passed by expression and support
#'   \link[rlang]{quasiquotation}; you can unquote strings and symbols.
#'
#' @param collapse A character vector of variables to collapse text across,
#'  or `NULL`.
#'
#'   For tokens like n-grams or sentences, text can be collapsed across rows
#'   within variables specified by `collapse` before tokenization (see the
#'   n-gram example under Examples). At tidytext 0.2.7, the default behavior
#'   for `collapse = NULL` changed to be more consistent: text is _not_
#'   collapsed for `NULL`.
#'
#'   Grouped data specifies the variables to collapse across in the same way
#'   as `collapse`, but you **cannot** use both the `collapse` argument and
#'   grouped data. Collapsing applies mostly to `token` options of "ngrams",
#'   "skip_ngrams", "sentences", "lines", "paragraphs", or "regex".
#'
#' @param ... Extra arguments passed on to \link[tokenizers]{tokenizers}, such
#' as \code{strip_punct} for "words" and "tweets", \code{n} and \code{k} for
#' "ngrams" and "skip_ngrams", \code{strip_url} for "tweets", and
#' \code{pattern} for "regex".
#'
#' @details If format is anything other than "text", this uses the
#' \code{\link[hunspell]{hunspell_parse}} tokenizer instead of the tokenizers package.
#' This does not yet have support for tokenizing by any unit other than words.
#'
#' @import dplyr
#' @import rlang
#' @import tokenizers
#' @import janeaustenr
#' @importFrom vctrs vec_rep_each
#' @importFrom vctrs vec_slice
#' @export
#'
#' @name unnest_tokens
#'
#' @examples
#'
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#' d
#'
#' d %>%
#'   unnest_tokens(word, txt)
#'
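#' # extra arguments in `...` are passed on to the underlying tokenizer; a
#' # small sketch keeping punctuation as tokens with `strip_punct = FALSE`
#' d %>%
#'   unnest_tokens(word, txt, strip_punct = FALSE)
#'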
#' d %>%
#'   unnest_tokens(sentence, txt, token = "sentences")
#'
#' d %>%
#'   unnest_tokens(ngram, txt, token = "ngrams", n = 2)
#'
#' d %>%
#'   unnest_tokens(chapter, txt, token = "regex", pattern = "Chapter [\\\\d]")
#'
#' d %>%
#'   unnest_tokens(shingle, txt, token = "character_shingles", n = 4)
#'
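#' # collapse rows into larger documents before tokenizing into n-grams; a
#' # sketch using a `chapter` column derived from the text itself
#' d %>%
#'   mutate(chapter = cumsum(stringr::str_detect(txt, "^Chapter "))) %>%
#'   unnest_tokens(ngram, txt, token = "ngrams", n = 2, collapse = "chapter")
#'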
#' # custom function
#' d %>%
#'   unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")
#'
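#' # a custom tokenizer should take a character vector and return a list of
#' # character vectors of the same length; a minimal sketch with a
#' # hypothetical comma splitter
#' split_commas <- function(x) strsplit(x, ", ?")
#' tibble(txt = "apples, oranges, pears") %>%
#'   unnest_tokens(item, txt, token = split_commas)
#'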
#' # tokenize HTML
#' h <- tibble(row = 1:2,
#'             text = c("<h1>Text <b>is</b>", "<a href='example.com'>here</a>"))
#'
#' h %>%
#'   unnest_tokens(word, text, format = "html")
#'
unnest_tokens <- function(tbl, output, input, token = "words",
                          format = c(
                            "text", "man", "latex",
                            "html", "xml"
                          ),
                          to_lower = TRUE, drop = TRUE,
                          collapse = NULL, ...) {
  output <- enquo(output)
  input <- enquo(input)
  format <- arg_match(format)

  # `collapse` is passed along so find_function() can warn about the
  # 0.2.7 change in default collapsing behavior
  tokenfunc <- find_function(token, format, to_lower, collapse, ...)

  if (!is_null(collapse)) {
    if (is_logical(collapse)) {
      lifecycle::deprecate_stop(
        "0.2.7",
        "tidytext::unnest_tokens(collapse = 'must be `NULL` or a character vector')"
      )
    }

    if (is_grouped_df(tbl)) {
      rlang::abort(
        "Use the `collapse` argument or grouped data, but not both."
      )
    }
    if (any(!purrr::map_lgl(tbl, is_atomic))) {
      rlang::abort(
        paste0("If collapse != NULL (such as for unnesting by sentence or paragraph),\n",
               "unnest_tokens needs all input columns to be atomic vectors (not lists)")
      )
    }

    tbl <- group_by(tbl, !!!syms(collapse))
  }

  if (is_grouped_df(tbl)) {
    # collapse consecutive rows that share a grouping into a single row,
    # joining their text with newlines, before tokenizing
    tbl <- tbl %>%
      ungroup() %>%
      mutate(new_groups = cumsum(c(1, diff(group_indices(tbl)) != 0))) %>%
      group_by(new_groups, !!!groups(tbl)) %>%
      summarise(!!input := stringr::str_c(!!input, collapse = "\n")) %>%
      group_by(!!!groups(tbl)) %>%
      dplyr::select(-new_groups)

    if (!is_null(collapse)) {
      tbl <- ungroup(tbl)
    }
  }

  col <- pull(tbl, !!input)
  output_lst <- tokenfunc(col, ...)

  if (!(is.list(output_lst) && length(output_lst) == nrow(tbl))) {
    rlang::abort(
      paste0("Expected output of tokenizing function to be a list of length ",
             nrow(tbl))
    )
  }

  output <- quo_name(output)
  input <- quo_name(input)

  # repeat each original row once per token it produced, then add the tokens
  tbl_indices <- vec_rep_each(seq_len(nrow(tbl)), lengths(output_lst))
  ret <- vec_slice(tbl, tbl_indices)
  ret[[output]] <- flatten_chr(output_lst)

  if (to_lower) {
    if (!is_function(token) && token == "tweets") {
      rlang::inform("Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.")
    }
    ret[[output]] <- stringr::str_to_lower(ret[[output]])
  }

  # For data.tables we want this to hit the result and be after the result
  # has been assigned, just to make sure that we don't reduce the data.table
  # to 0 rows before inserting the output.
  if (drop && output != input) {
    ret[[input]] <- NULL
  }

  ret
}

find_function <- function(token, format, to_lower, collapse, ...) {
  if (is_function(token)) {
    tokenfunc <- token
  } else if (token %in% c(
    "word", "character",
    "character_shingle", "ngram",
    "skip_ngram", "sentence", "line",
    "paragraph", "tweet"
  )) {
    rlang::abort(paste0(
      "Token must be a supported type, or a function that takes a character vector as input",
      "\nDid you mean token = ", token, "s?"
    ))
  } else if (format != "text") {
    if (token != "words") {
      rlang::abort("Cannot tokenize by any unit except words when format is not text")
    }
    tokenfunc <- function(col, ...) {
      hunspell::hunspell_parse(col, format = format)
    }
  } else {
    if (is_null(collapse) && token %in% c(
      "ngrams", "skip_ngrams", "sentences",
      "lines", "paragraphs", "regex",
      "character_shingles"
    )) {
      lifecycle::deprecate_warn(
        "0.2.7",
        "tidytext::unnest_tokens(collapse = 'changed its default behavior for `NULL`')"
      )
    }
    # look up the matching tokenizers function, e.g. tokenize_words()
    tf <- get(paste0("tokenize_", token))
    if (token %in% c(
      "characters", "words", "ngrams", "skip_ngrams",
      "tweets", "ptb"
    )) {
      tokenfunc <- function(col, ...) tf(col, lowercase = to_lower, ...)
    } else {
      tokenfunc <- tf
    }
  }

  tokenfunc
}