#' Extract noun phrases from texts using spaCy
#'
#' Extracts the noun phrases found in one or more documents, based on the
#' \code{noun_chunks} attribute of document objects parsed by spaCy (see
#' \url{https://spacy.io/usage/linguistic-features#noun-chunks}).
#'
#' @param x a character object or a TIF-compliant corpus data.frame (see
#'   \url{https://github.com/ropensci/tif})
#' @inheritParams spacy_parse
#' @param output type of returned object, either \code{"data.frame"} or
#'   \code{"list"}
#' @param ... unused
#' @details With \code{output = "data.frame"}, the returned
#'   \code{data.frame} contains the following fields.
#' \describe{\item{\code{text}}{contents of the noun phrase}
#' \item{\code{root_text}}{contents of the root token}
#' \item{\code{start_id}}{serial number ID of the starting token; this number
#' corresponds to the numbering of the \code{data.frame} returned by
#' \code{spacy_tokenize(x)} with default options.}
#' \item{\code{root_id}}{serial number ID of the root token}
#' \item{\code{length}}{number of words (tokens) in the noun phrase (e.g.
#' for the noun phrase "individual car owners", \code{length = 3})}}
#'
#' @return either a \code{list} or \code{data.frame} of tokens
#' @export
#' @examples
#' \donttest{
#' spacy_initialize()
#'
#' txt <- c(doc1 = "Natural language processing is a branch of computer science.",
#'          doc2 = "Paul earned a postgraduate degree from MIT.")
#' spacy_extract_nounphrases(txt)
#' spacy_extract_nounphrases(txt, output = "list")
#' }
spacy_extract_nounphrases <- function(x, output = c("data.frame", "list"),
                                      multithread = TRUE, ...) {
    # S3 dispatch on the class of x (character and data.frame methods below)
    UseMethod("spacy_extract_nounphrases")
}

41
#' @importFrom data.table data.table
#' @export
spacy_extract_nounphrases.character <- function(x,
                                                output = c("data.frame", "list"),
                                                multithread = TRUE, ...) {

    # silence R CMD check notes for data.table non-standard evaluation
    `root_id` <- `start_id` <- `:=` <- NULL

    output <- match.arg(output)

    # derive document names: use the names of x, or generate "text1", "text2", ...
    # (seq_along(), not 1:length(x), so zero-length input yields zero docnames)
    if (!is.null(names(x))) {
        docnames <- names(x)
    } else {
        docnames <- paste0("text", seq_along(x))
    }
    # a single document cannot benefit from multithreading
    if (length(x) == 1) {
        multithread <- FALSE
    }

    if (anyDuplicated(docnames) > 0L) {
        stop("Docnames are duplicated.")
    } else if (any(nchar(docnames) == 0L)) {
        stop("Some docnames are missing.")
    }

    # lazily start the spaCy backend on first use
    if (is.null(getOption("spacy_initialized"))) spacy_initialize()
    # clear any stale python-side state from a previous call
    spacyr_pyexec("try:\n del spobj\nexcept NameError:\n 1")
    spacyr_pyexec("texts = []")

    if (spacyr_pyget("py_version") != 3) {
        message("multithreading for python 2 is not supported by spacy_extract_nounphrases()")
        multithread <- FALSE
    }

    # normalize doubled escape sequences before sending text to python
    x <- gsub("\\\\n", "\\\n", x) # replace literal backslash-n with newline
    x <- gsub("\\\\t", "\\\t", x) # replace literal backslash-t with tab
    x <- gsub("\\\\", "", x) # delete unnecessary backslashes
    x <- unname(x)

    ## send documents to python
    spacyr_pyassign("texts", x)
    spacyr_pyassign("docnames", docnames)
    spacyr_pyassign("multithread", multithread)

    ## run noun phrase extraction
    spacyr_pyexec("spobj = spacyr()")
    if (identical(output, "list")) {
        command_str <- paste("noun_phrases = spobj.extract_nounphrases_list(texts = texts,",
                             "docnames = docnames,",
                             "multithread = multithread)")
        spacyr_pyexec(command_str)
        return(spacyr_pyget("noun_phrases"))
    } else {
        command_str <- paste("noun_phrases = spobj.extract_nounphrases_dataframe(texts = texts,",
                             "docnames = docnames,",
                             "multithread = multithread)")
        spacyr_pyexec(command_str)
        noun_phrases <- spacyr_pyget("noun_phrases")

        # bind the per-document results, dropping documents with no noun phrase
        doc_id <- names(noun_phrases)
        data_out <-
            data.table::rbindlist(lapply(doc_id, function(x) {
                df <- as.data.frame(noun_phrases[[x]], stringsAsFactors = FALSE)
                if (nrow(df) == 0) return(NULL)
                df$doc_id <- x
                return(df)
            }))
        if (nrow(data_out) == 0) {
            message("No noun phrase found in documents")
            return(NULL)
        }

        # convert python's 0-based token indices to R's 1-based indexing
        data_out[, start_id := start_id + 1][, root_id := root_id + 1]
        data.table::setDF(data_out)
        # move doc_id (appended as column 6 above) to the front
        data_out <- data_out[, c(6, 1:5)]
        return(data_out)
    }
}

123
#' @export
spacy_extract_nounphrases.data.frame <- function(x, ...) {

    # minimal TIF compliance check; to be replaced by the tif package
    required_fields <- c("doc_id", "text")
    if (any(!required_fields %in% names(x)))
        stop("input data.frame does not conform to the TIF standard")

    # convert to a named character vector and dispatch to the character method
    texts <- x$text
    names(texts) <- x$doc_id
    spacy_extract_nounphrases(texts, ...)
}
