"""
This module implements the XMLFeedSpider which is the recommended spider to use
for scraping from an XML feed.

See documentation in docs/topics/spiders.rst
"""
from scrapy.spiders import Spider
from scrapy.utils.iterators import xmliter, csviter
from scrapy.utils.spider import iterate_spider_output
from scrapy.selector import Selector
from scrapy.exceptions import NotConfigured, NotSupported


14 7
class XMLFeedSpider(Spider):
    """Base class for spiders that scrape data out of XML feeds.

    The feed can be traversed with the 'iternodes' iterator, an 'xml'
    selector, or an 'html' selector.  In most cases 'iternodes' is the
    convenient choice, since it is faster and cleaner.
    """

    iterator = 'iternodes'
    itertag = 'item'
    namespaces = ()

    def process_results(self, response, results):
        """Overridable post-processing hook.

        Called for each batch of results (items or requests) produced by
        the spider; meant for any last-minute processing (e.g. setting
        item GUIDs) before the results go back to the framework core.
        Receives the originating response and a list of results, and
        must return a list of results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """Overridable pre-processing hook: make any changes you want to
        the feed before it is parsed.  Must return a response.
        """
        return response

    def parse_node(self, response, selector):
        """This method must be overridden with your custom spider functionality."""
        if hasattr(self, 'parse_item'):  # backward compatibility
            return self.parse_item(response, selector)
        raise NotImplementedError

    def parse_nodes(self, response, nodes):
        """Called for the nodes matching the provided tag name (itertag).

        Receives the response and a Selector for each node.  Overriding
        this method is mandatory, otherwise your spider won't work.  It
        must return either an item, a request, or a list containing any
        of them.
        """
        for node in nodes:
            outputs = iterate_spider_output(self.parse_node(response, node))
            yield from self.process_results(response, outputs)

    def _parse(self, response, **kwargs):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator in ('xml', 'html'):
            # Both selector-backed iterators differ only in the Selector type.
            sel = Selector(response, type=self.iterator)
            self._register_namespaces(sel)
            nodes = sel.xpath(f'//{self.itertag}')
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)

    def _iternodes(self, response):
        # Lazily stream nodes, registering namespaces on each one.
        for node in xmliter(response, self.itertag):
            self._register_namespaces(node)
            yield node

    def _register_namespaces(self, selector):
        for prefix, uri in self.namespaces:
            selector.register_namespace(prefix, uri)
92

93

94 7
class CSVFeedSpider(Spider):
    """Spider for parsing CSV feeds.

    It receives a CSV file in a response, walks through each of its
    rows, and calls parse_row with a dict containing each field's data.

    Options about the CSV file, such as the delimiter, quotechar and
    the file's headers, can be set via class attributes.
    """

    # With None, the defaults of python's csv module are used.
    delimiter = None
    quotechar = None
    headers = None

    def process_results(self, response, results):
        """Same purpose as the method of the same name in XMLFeedSpider."""
        return results

    def adapt_response(self, response):
        """Same purpose as the method of the same name in XMLFeedSpider."""
        return response

    def parse_row(self, response, row):
        """This method must be overridden with your custom spider functionality."""
        raise NotImplementedError

    def parse_rows(self, response):
        """Receives a response and a dict (representing each row) with a key
        for each provided (or detected) header of the CSV file.  This spider
        also gives the opportunity to override adapt_response and
        process_results methods for pre- and post-processing purposes.
        """
        rows = csviter(response, self.delimiter, self.headers, self.quotechar)
        for row in rows:
            outputs = iterate_spider_output(self.parse_row(response, row))
            yield from self.process_results(response, outputs)

    def _parse(self, response, **kwargs):
        if not hasattr(self, 'parse_row'):
            raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
        return self.parse_rows(self.adapt_response(response))
