ropensci / tokenizers

@@ -18,7 +18,7 @@
 
 remove_stopwords <- function(x, stopwords) {
   out <- x[!x %in% stopwords]
-  if(!length(out)){
+  if (!length(out)) {
     return(NA_character_)
   }
   return(out)
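
For reference, a minimal sketch of how this helper behaves. Because remove_stopwords() is internal to the package, it is reproduced here from the hunk above rather than called from an installed namespace; when every token is a stopword it returns a single NA_character_ rather than a zero-length vector, which the NA filtering later in tokenize_tweets relies on.

# reproduced from the hunk above (internal helper, not exported)
remove_stopwords <- function(x, stopwords) {
  out <- x[!x %in% stopwords]
  if (!length(out)) {
    return(NA_character_)
  }
  return(out)
}

remove_stopwords(c("the", "quick", "fox"), stopwords = c("the", "a"))
#> [1] "quick" "fox"
remove_stopwords(c("the", "a"), stopwords = c("the", "a"))
#> [1] NA    # length-one NA_character_ instead of character(0)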

@@ -121,7 +121,7 @@
 get_valid_skips <- function(n, k) {
   max_dist <- k * (n - 1) + (n - 1)
   total_combinations <- choose(max_dist, n - 1)
-  if (total_combinations > 5e3){
+  if (total_combinations > 5e3) {
     warning("Input n and k will produce a very large number of skip n-grams")
   }
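
A rough illustration of the guard above (the n = 4, k = 10 values are invented for this sketch): the number of candidate skip patterns is choose(k * (n - 1) + (n - 1), n - 1), and anything over 5e3 triggers the warning.

# made-up n and k, chosen so the combination count crosses the 5e3 threshold
n <- 4
k <- 10
max_dist <- k * (n - 1) + (n - 1)   # 33
choose(max_dist, n - 1)
#> [1] 5456    # > 5e3, so get_valid_skips(4, 10) would warn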

@@ -18,89 +18,85 @@
 }
 
 #' @export
-tokenize_tweets.data.frame <-
-  function(x,
-           lowercase = TRUE,
-           stopwords = NULL,
-           strip_punct = TRUE,
-           strip_url = FALSE,
-           simplify = FALSE) {
-    x <- corpus_df_as_corpus_vector(x)
-    tokenize_tweets(x, lowercase, stopwords, strip_punct, strip_url, simplify)
-  }
+tokenize_tweets.data.frame <- function(x,
+                                       lowercase = TRUE,
+                                       stopwords = NULL,
+                                       strip_punct = TRUE,
+                                       strip_url = FALSE,
+                                       simplify = FALSE) {
+  x <- corpus_df_as_corpus_vector(x)
+  tokenize_tweets(x, lowercase, stopwords, strip_punct, strip_url, simplify)
+}
 
 #' @export
-tokenize_tweets.default <-
-  function(x,
-           lowercase = TRUE,
-           stopwords = NULL,
-           strip_punct = TRUE,
-           strip_url = FALSE,
-           simplify = FALSE) {
-    check_input(x)
-    named <- names(x)
+tokenize_tweets.default <- function(x,
+                                    lowercase = TRUE,
+                                    stopwords = NULL,
+                                    strip_punct = TRUE,
+                                    strip_url = FALSE,
+                                    simplify = FALSE) {
+  check_input(x)
+  named <- names(x)
 
-    # split on white space
-    out <- stri_split_charclass(x, "\\p{WHITE_SPACE}")
+  # split on white space
+  out <- stri_split_charclass(x, "\\p{WHITE_SPACE}")
 
-    # get document indexes to vectorize tokens
-    doc_lengths <- cumsum(lengths(out))
-    docindex <- c(0, doc_lengths)
-    # convert the list into a vector - avoids all those mapplys
-    out <- unlist(out)
+  # get document indexes to vectorize tokens
+  doc_lengths <- cumsum(lengths(out))
+  docindex <- c(0, doc_lengths)
+  # convert the list into a vector - avoids all those mapplys
+  out <- unlist(out)
 
-    # get the index of twitter hashtags and usernames
-    index_twitter <- stri_detect_regex(out, "^#[A-Za-z]+\\w*|^@\\w+")
-    # get the index of http(s) URLs
-    index_url <- stri_detect_regex(out, "^http")
+  # get the index of twitter hashtags and usernames
+  index_twitter <- stri_detect_regex(out, "^#[A-Za-z]+\\w*|^@\\w+")
+  # get the index of http(s) URLs
+  index_url <- stri_detect_regex(out, "^http")
 
-    if (strip_url) {
-      out[index_url] <- ""
-    }
+  if (strip_url) {
+    out[index_url] <- ""
+  }
 
-    if (lowercase) {
-      out[!(index_twitter | index_url)] <-
-        stri_trans_tolower(out[!(index_twitter | index_url)])
-    }
+  if (lowercase) {
+    out[!(index_twitter | index_url)] <-
+      stri_trans_tolower(out[!(index_twitter | index_url)])
+  }
 
-    if (strip_punct) {
-      twitter_chars <- stri_sub(out[index_twitter], 1, 1)
-      out[!index_url] <-
-        stri_replace_all_charclass(out[!index_url], "\\p{P}", "")
-      #stri_replace_all_charclass(out[!index_url], "[^\\P{P}#@]", "")
-      out[index_twitter] <- paste0(twitter_chars, out[index_twitter])
-    } else {
-      # all except URLs
-      out[!index_url] <-
-        stri_split_boundaries(out[!index_url], type = "word")
-      # rejoin the hashtags and usernames
-      out[index_twitter] <-
-        lapply(out[index_twitter], function(toks) {
-          toks[2] <- paste0(toks[1], toks[2])
-          toks[-1]
-        })
-    }
+  if (!is.null(stopwords))
+    out <- sapply(out, remove_stopwords, stopwords, USE.NAMES = FALSE)
 
-    # convert the vector back to a list
-    out <- split(out,
-                 cut(
-                   seq_along(out),
-                   docindex,
-                   include.lowest = FALSE,
-                   labels = named
-                 ))
-    # in case !strip_punct, otherwise has no effect
-    out <- lapply(out, unlist)
+  if (strip_punct) {
+    twitter_chars <- stri_sub(out[index_twitter], 1, 1)
+    out[!index_url] <-
+      stri_replace_all_charclass(out[!index_url], "\\p{P}", "")
+    out[index_twitter] <- paste0(twitter_chars, out[index_twitter])
+  } else {
+    # all except URLs
+    out[!index_url] <-
+      stri_split_boundaries(out[!index_url], type = "word")
+    # rejoin the hashtags and usernames
+    out[index_twitter] <-
+      lapply(out[index_twitter], function(toks) {
+        toks[2] <- paste0(toks[1], toks[2])
+        toks[-1]
+      })
+  }
 
-    names(out) <- named
+  # convert the vector back to a list
+  out <- split(out,
+               cut(
+                 seq_along(out),
+                 docindex,
+                 include.lowest = TRUE,
+                 labels = named
+               ))
+  # in case !strip_punct, otherwise has no effect
+  out <- lapply(out, unlist)
 
-    # remove stopwords
-    if (!is.null(stopwords))
-      out <- lapply(out, remove_stopwords, stopwords)
+  names(out) <- named
 
-    # remove any blanks (from removing URLs)
-    out <- lapply(out, function(toks)
-      toks[toks != ""])
+  # remove any blanks (from removing URLs)
+  out <- lapply(out, function(toks)
+    toks[toks != "" & !is.na(toks)])
 
-    simplify_list(out, simplify)
-  }
+  simplify_list(out, simplify)
+}
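
A hedged usage sketch of the reordered pipeline (the tweet text and stopword list are invented for illustration): with this change, stopword removal happens on the whitespace-split tokens before punctuation is stripped, and blank or NA tokens left behind by URL and stopword removal are dropped at the end.

library(tokenizers)

tweet <- "@rOpenSci the #rstats tokenizers package is great! https://example.com"
tokenize_tweets(tweet, stopwords = c("the", "is"), strip_url = TRUE)
#> Expected (roughly): a one-element list containing
#> "@rOpenSci" "#rstats" "tokenizers" "package" "great"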

@@ -45,11 +45,11 @@
             length(x) == 1)
   words <- tokenize_words(x, simplify = TRUE, ...)
 
-  if(length(words) <= chunk_size) {
+  if (length(words) <= chunk_size) {
     chunks <- x
   }
 
-  chunks <- split(words, ceiling(seq_along(words)/chunk_size))
+  chunks <- split(words, ceiling(seq_along(words) / chunk_size))
 
   if (!is.null(doc_id)) {
     num_chars <- stringi::stri_length(length(chunks))
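
A minimal sketch of the chunking arithmetic in the reformatted line above (the word vector and chunk_size are invented): each word is assigned to chunk ceiling(position / chunk_size), so a seven-word input with chunk_size = 3 splits into chunks of 3, 3, and 1 words.

words <- c("one", "two", "three", "four", "five", "six", "seven")
chunk_size <- 3
split(words, ceiling(seq_along(words) / chunk_size))
#> $`1`
#> [1] "one"   "two"   "three"
#> $`2`
#> [1] "four" "five" "six"
#> $`3`
#> [1] "seven"
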
Files                       Coverage
R                           98.44%
src                         98.89%
Project Totals (12 files)   98.54%