1 2
from typing import List, Optional
2

3 2
import spacy
4

5 2
from snorkel.types import FieldMap, HashingFunction
6

7 2
from .core import BasePreprocessor, Preprocessor
8

9 2
# Name of the SpaCy model used as the default ``language`` argument of
# ``SpacyPreprocessor`` (passed to ``spacy.load``).
EN_CORE_WEB_SM = "en_core_web_sm"
10

11

12 2
class SpacyPreprocessor(Preprocessor):
    """Preprocessor that attaches a parsed SpaCy ``Doc`` to each data point.

    Labeling functions over text usually want tokens, part-of-speech tags
    and similar linguistic annotations rather than a raw string. SpaCy
    (https://spacy.io/) is a popular parser for producing them. This
    preprocessor runs a SpaCy model over one text field of the data point
    and stores the resulting ``Doc`` object in another field.

    A ``Doc`` is a sequence of ``Token`` objects carrying lemmas,
    parts-of-speech, etc. It also exposes document-level fields such as
    ``Doc.ents`` (named entities) and ``Doc.noun_chunks`` (noun phrases).
    See https://spacy.io/api/doc for the full attribute listing.

    Parameters
    ----------
    text_field
        Name of the data point field holding the text to parse
    doc_field
        Name of the data point field that receives the parsed document
    language
        SpaCy model to load
        See https://spacy.io/usage/models#usage
    disable
        List of pipeline components to disable
        See https://spacy.io/usage/processing-pipelines#disabling
    pre
        Preprocessors to run before this preprocessor is executed
    memoize
        Memoize preprocessor outputs?
    memoize_key
        Hashing function to handle the memoization (default to snorkel.map.core.get_hashable)
    gpu
        Prefer Spacy GPU processing?
    """

    def __init__(
        self,
        text_field: str,
        doc_field: str,
        language: str = EN_CORE_WEB_SM,
        disable: Optional[List[str]] = None,
        pre: Optional[List[BasePreprocessor]] = None,
        memoize: bool = False,
        memoize_key: Optional[HashingFunction] = None,
        gpu: bool = False,
    ) -> None:
        # The preprocessor is named after its concrete class, so subclasses
        # automatically get their own name.
        super().__init__(
            type(self).__name__,
            field_names={"text": text_field},
            mapped_field_names={"doc": doc_field},
            pre=pre,
            memoize=memoize,
            memoize_key=memoize_key,
        )

        self.gpu = gpu
        if self.gpu:
            # Called ahead of ``spacy.load`` so the preference is set
            # before the model is created.
            spacy.prefer_gpu()

        # ``spacy.load`` is always handed a list: a missing (or empty)
        # ``disable`` argument becomes an empty list.
        excluded_components = disable if disable else []
        self._nlp = spacy.load(language, disable=excluded_components)

    def run(self, text: str) -> FieldMap:  # type: ignore
        """Parse ``text`` with the loaded SpaCy model.

        Parameters
        ----------
        text
            Text of document to parse

        Returns
        -------
        FieldMap
            Dictionary with a single key (``"doc"``), mapping to the
            parsed SpaCy ``Doc`` object
        """
        # The ``Doc`` is returned whole instead of being flattened into
        # top-level fields, as most of its attributes are Cython property
        # methods computed on the fly.
        parsed_doc = self._nlp(text)
        return {"doc": parsed_doc}

Read our documentation on viewing source code.

Loading