juliasilge / tidytext
1
#' Tidy a Corpus object from the tm package
2
#'
3
#' Tidy a Corpus object from the tm package. Returns a data frame
4
#' with one-row-per-document, with a \code{text} column containing
5
#' the document's text, and one column for each local (per-document)
6
#' metadata tag. For corpus objects from the quanteda package,
7
#' see \code{\link{tidy.corpus}}.
8
#'
9
#' @param x A Corpus object, such as a VCorpus or PCorpus
10
#' @param collapse A string that should be used to
11
#' collapse text within each corpus (if a document has multiple lines).
12
#' Give NULL to not collapse strings, in which case a corpus
13
#' will end up as a list column if there are multi-line documents.
14
#' @param ... Extra arguments, not used
15
#'
16
#' @examples
17
#'
18
#' library(dplyr)   # displaying tbl_dfs
19
#'
20
#' if (requireNamespace("tm", quietly = TRUE)) {
21
#'   library(tm)
22
#'   #' # tm package examples
23
#'   txt <- system.file("texts", "txt", package = "tm")
24
#'   ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
25
#'                   readerControl = list(language = "lat"))
26
#'
27
#'   ovid
28
#'   tidy(ovid)
29
#'
30
#'   # choose different options for collapsing text within each
31
#'   # document
32
#'   tidy(ovid, collapse = "")$text
33
#'   tidy(ovid, collapse = NULL)$text
34
#'
35
#'   # another example from Reuters articles
36
#'   reut21578 <- system.file("texts", "crude", package = "tm")
37
#'   reuters <- VCorpus(DirSource(reut21578),
38
#'                      readerControl = list(reader = readReut21578XMLasPlain))
39
#'   reuters
40
#'
41
#'   tidy(reuters)
42
#' }
43
#'
44
#' @export
45
tidy.Corpus <- function(x, collapse = "\n", ...) {
46 1
  local_meta <- NLP::meta(x, type = "local") %>%
47 1
    purrr::transpose()
48

49 1
  columns <- purrr::map(local_meta, function(m) {
50 1
    lengths <- purrr::map_dbl(m, length)
51 1
    if (any(lengths > 1)) {
52
      # keep as a list column
53 0
      return(m)
54
    }
55 1
    m <- purrr::map_at(m, which(lengths == 0), ~ NA)
56

57 1
    ret <- unname(do.call(c, m))
58
    ## tbl_df() doesn't support POSIXlt format
59
    ## https://github.com/hadley/dplyr/issues/1382
60 1
    if (inherits(ret, "POSIXlt")) {
61 1
      ret <- as.POSIXct(ret)
62
    }
63 1
    ret
64
  })
65

66 1
  ret <- as_tibble(columns)
67

68
  # most importantly, add text
69 1
  text <- purrr::map(as.list(x), as.character)
70

71 1
  if (all(purrr::map(text, length) == 1)) {
72 0
    text <- unlist(text)
73 1
  } else if (!is.null(collapse)) {
74 1
    text <- purrr::map_chr(text, stringr::str_c, collapse = collapse)
75
  }
76 1
  ret$text <- text
77

78 1
  ret
79
}
80

81

82
#' Tidiers for a corpus object from the quanteda package
83
#'
84
#' Tidy a corpus object from the quanteda package. \code{tidy} returns a
85
#' tbl_df with one-row-per-document, with a \code{text} column containing
86
#' the document's text, and one column for each document-level metadata.
87
#' \code{glance} returns a one-row tbl_df with corpus-level metadata,
88
#' such as source and created. For Corpus objects from the tm package,
89
#' see \code{\link{tidy.Corpus}}.
90
#'
91
#' @param x A Corpus object, such as a VCorpus or PCorpus
92
#' @param ... Extra arguments, not used
93
#'
94
#' @importFrom generics glance
95
#'
96
#' @details For the most part, the \code{tidy} output is equivalent to the
97
#' "documents" data frame in the corpus object, except that it is converted
98
#' to a tbl_df, and \code{texts} column is renamed to \code{text}
99
#' to be consistent with other uses in tidytext.
100
#'
101
#' Similarly, the \code{glance} output is simply the "metadata" object,
102
#' with NULL fields removed and turned into a one-row tbl_df.
103
#'
104
#' @examples
105
#'
106
#' if (requireNamespace("quanteda", quietly = TRUE)) {
107
#'  data("data_corpus_inaugural", package = "quanteda")
108
#'
109
#'  data_corpus_inaugural
110
#'
111
#'  tidy(data_corpus_inaugural)
112
#' }
113
#'
114
#' @name corpus_tidiers
115
#'
116
#' @export
117
tidy.corpus <- function(x, ...) {
118 1
  tibble::as_tibble(data.frame(
119 1
    text = as.character(x),
120 1
    quanteda::docvars(x),
121 1
    stringsAsFactors = FALSE
122
  ))
123
}
124

125

126
#' @rdname corpus_tidiers
127
#' @export
128
glance.corpus <- function(x, ...) {
129 1
  md <- purrr::compact(quanteda::meta(x))
130

131
  # turn vectors into list columns
132 1
  md <- purrr::map_if(md, ~ length(.) > 1, list)
133

134 1
  as_tibble(md)
135
}
136

137
#' @export
138
generics::glance

Read our documentation on viewing source code .

Loading