scrapy / scrapy
1
"""
2
HttpError Spider Middleware
3

4
See documentation in docs/topics/spider-middleware.rst
5
"""
6 7
import logging
7

8 7
from scrapy.exceptions import IgnoreRequest
9

10 7
logger = logging.getLogger(__name__)
11

12

13 7
class HttpError(IgnoreRequest):
    """A non-200 response was filtered"""

    def __init__(self, response, *args, **kwargs):
        """Remember the filtered response and initialise the base exception.

        :param response: the response that was filtered out; it is stored
            on ``self.response`` so that handlers of this exception can
            inspect it.
        :param args: positional arguments forwarded unchanged to the
            parent exception initialiser.
        :param kwargs: keyword arguments forwarded unchanged to the
            parent exception initialiser.
        """
        self.response = response
        super().__init__(*args, **kwargs)
19

20

21 7
class HttpErrorMiddleware:
    """Spider middleware that filters out responses with unhandled statuses.

    Responses whose status is in the 2xx range always pass.  Any other
    response passes only when its status is explicitly allowed (see
    :meth:`process_spider_input` for the order of precedence); otherwise
    :class:`HttpError` is raised for it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def __init__(self, settings):
        """Read the middleware configuration from *settings*.

        :param settings: object providing ``getbool`` and ``getlist``;
            ``HTTPERROR_ALLOW_ALL`` and ``HTTPERROR_ALLOWED_CODES`` are
            read from it.
        """
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        """Let *response* through or raise :class:`HttpError` for it.

        The checks are applied in this order:

        1. a 2xx status always passes;
        2. a truthy ``handle_httpstatus_all`` key in ``response.meta``
           passes any status;
        3. a ``handle_httpstatus_list`` key in ``response.meta`` is used as
           the list of allowed statuses (it takes precedence over the
           ``HTTPERROR_ALLOW_ALL`` setting);
        4. otherwise the ``HTTPERROR_ALLOW_ALL`` setting passes any status;
        5. otherwise the spider's ``handle_httpstatus_list`` attribute is
           used, falling back to the ``HTTPERROR_ALLOWED_CODES`` setting.

        :param response: the response being processed; ``status`` and
            ``meta`` are read from it.
        :param spider: the spider the response is intended for.
        :returns: ``None`` when the response is allowed.
        :raises HttpError: when the status is not allowed.
        """
        if 200 <= response.status < 300:  # common case
            return
        meta = response.meta
        if meta.get('handle_httpstatus_all', False):
            return
        if 'handle_httpstatus_list' in meta:
            # A per-request list wins over the global allow-all setting.
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            allowed_statuses = getattr(
                spider, 'handle_httpstatus_list', self.handle_httpstatus_list
            )
        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        """Record and swallow an :class:`HttpError` raised for *response*.

        For an ``HttpError`` two counters are incremented on
        ``spider.crawler.stats`` (a total and a per-status one), an info
        message is logged, and an empty list is returned.  For any other
        exception nothing is done and ``None`` is returned implicitly.

        :param response: the response being processed when the exception
            was raised.
        :param exception: the exception that was raised.
        :param spider: the spider that raised the exception.
        :returns: ``[]`` for an ``HttpError``, otherwise ``None``.
        """
        if isinstance(exception, HttpError):
            spider.crawler.stats.inc_value('httperror/response_ignored_count')
            spider.crawler.stats.inc_value(
                f'httperror/response_ignored_status_count/{response.status}'
            )
            logger.info(
                "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
                {'response': response}, extra={'spider': spider},
            )
            return []

Read our documentation on viewing source code .

Loading