scrapy / scrapy

@@ -0,0 +1,112 @@
+import sys
+import logging
+from abc import ABCMeta, abstractmethod
+from six import with_metaclass
+
+from scrapy.utils.python import to_native_str, to_unicode
+
+logger = logging.getLogger(__name__)
+
+
+class RobotParser(with_metaclass(ABCMeta)):
+    @classmethod
+    @abstractmethod
+    def from_crawler(cls, crawler, robotstxt_body):
+        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
+        It must return a new instance of the parser backend.
+
+        :param crawler: crawler which made the request
+        :type crawler: :class:`~scrapy.crawler.Crawler` instance
+
+        :param robotstxt_body: content of a robots.txt_ file.
+        :type robotstxt_body: bytes
+        """
+        pass
+
+    @abstractmethod
+    def allowed(self, url, user_agent):
+        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.
+
+        :param url: Absolute URL
+        :type url: string
+
+        :param user_agent: User agent
+        :type user_agent: string
+        """
+        pass
+
+
+class PythonRobotParser(RobotParser):
+    def __init__(self, robotstxt_body, spider):
+        from six.moves.urllib_robotparser import RobotFileParser
+        self.spider = spider
+        try:
+            robotstxt_body = to_native_str(robotstxt_body)
+        except UnicodeDecodeError:
+            # If the robots.txt body is garbage or in an encoding other than UTF-8,
+            # disregard it and switch to the 'allow all' state.
+            logger.warning("Failure while parsing robots.txt using %(parser)s."
+                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
+                           {'parser': "RobotFileParser"},
+                           exc_info=sys.exc_info(),
+                           extra={'spider': self.spider})
+            robotstxt_body = ''
+        self.rp = RobotFileParser()
+        self.rp.parse(robotstxt_body.splitlines())
+
+    @classmethod
+    def from_crawler(cls, crawler, robotstxt_body):
+        spider = None if not crawler else crawler.spider
+        o = cls(robotstxt_body, spider)
+        return o
+
+    def allowed(self, url, user_agent):
+        user_agent = to_native_str(user_agent)
+        url = to_native_str(url)
+        return self.rp.can_fetch(user_agent, url)
+
+
+class ReppyRobotParser(RobotParser):
+    def __init__(self, robotstxt_body, spider):
+        from reppy.robots import Robots
+        self.spider = spider
+        self.rp = Robots.parse('', robotstxt_body)
+
+    @classmethod
+    def from_crawler(cls, crawler, robotstxt_body):
+        spider = None if not crawler else crawler.spider
+        o = cls(robotstxt_body, spider)
+        return o
+
+    def allowed(self, url, user_agent):
+        return self.rp.allowed(url, user_agent)
+
+
+class RerpRobotParser(RobotParser):
+    def __init__(self, robotstxt_body, spider):
+        from robotexclusionrulesparser import RobotExclusionRulesParser
+        self.spider = spider
+        self.rp = RobotExclusionRulesParser()
+        try:
+            robotstxt_body = robotstxt_body.decode('utf-8')
+        except UnicodeDecodeError:
+            # If the robots.txt body is garbage or in an encoding other than UTF-8,
+            # disregard it and switch to the 'allow all' state.
+            logger.warning("Failure while parsing robots.txt using %(parser)s."
+                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
+                           {'parser': "RobotExclusionRulesParser"},
+                           exc_info=sys.exc_info(),
+                           extra={'spider': self.spider})
+            robotstxt_body = ''
+        self.rp.parse(robotstxt_body)
+
+    @classmethod
+    def from_crawler(cls, crawler, robotstxt_body):
+        spider = None if not crawler else crawler.spider
+        o = cls(robotstxt_body, spider)
+        return o
+
+    def allowed(self, url, user_agent):
+        user_agent = to_unicode(user_agent)
+        url = to_unicode(url)
+        return self.rp.is_allowed(user_agent, url)
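
As a quick illustration of the interface introduced above, a backend can be exercised on its own: from_crawler() accepts the robots.txt body as bytes and allowed() answers per URL and user agent. A minimal sketch using PythonRobotParser, assuming the module lands at scrapy.robotstxt as the ROBOTSTXT_PARSER default below indicates; the example.com URLs and sample body are illustrative, and passing crawler=None simply leaves the spider reference unset:

from scrapy.robotstxt import PythonRobotParser

# Sample robots.txt body; from_crawler() expects bytes.
robotstxt_body = b"User-agent: *\nDisallow: /private\n"

# With crawler=None the backend keeps spider=None, as in the code above.
rp = PythonRobotParser.from_crawler(None, robotstxt_body)

print(rp.allowed("https://example.com/index.html", "Scrapy"))    # True
print(rp.allowed("https://example.com/private/page", "Scrapy"))  # False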

@@ -5,8 +5,8 @@
 """
 
 import logging
-
-from six.moves.urllib import robotparser
+import sys
+import re
 
 from twisted.internet.defer import Deferred, maybeDeferred
 from scrapy.exceptions import NotConfigured, IgnoreRequest
@@ -14,6 +14,7 @@
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.log import failure_to_exc_info
 from scrapy.utils.python import to_native_str
+from scrapy.utils.misc import load_object
 
 logger = logging.getLogger(__name__)
 
@@ -24,10 +25,13 @@
     def __init__(self, crawler):
         if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
             raise NotConfigured
-
+        self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
         self.crawler = crawler
-        self._useragent = crawler.settings.get('USER_AGENT')
         self._parsers = {}
+        self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))
+
+        # Check that the parser's dependencies are met; this should raise an error otherwise.
+        self._parserimpl.from_crawler(self.crawler, b'')
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -43,7 +47,8 @@
     def process_request_2(self, rp, request, spider):
         if rp is None:
             return
-        if not rp.can_fetch(to_native_str(self._useragent), request.url):
+        useragent = request.headers.get(b'User-Agent', self._default_useragent)
+        if not rp.allowed(request.url, useragent):
             logger.debug("Forbidden by robots.txt: %(request)s",
                          {'request': request}, extra={'spider': spider})
             self.crawler.stats.inc_value('robotstxt/forbidden')
@@ -62,13 +67,14 @@
                 meta={'dont_obey_robotstxt': True}
             )
             dfd = self.crawler.engine.download(robotsreq, spider)
-            dfd.addCallback(self._parse_robots, netloc)
+            dfd.addCallback(self._parse_robots, netloc, spider)
             dfd.addErrback(self._logerror, robotsreq, spider)
             dfd.addErrback(self._robots_error, netloc)
             self.crawler.stats.inc_value('robotstxt/request_count')
 
         if isinstance(self._parsers[netloc], Deferred):
             d = Deferred()
+
             def cb(result):
                 d.callback(result)
                 return result
@@ -85,27 +91,10 @@
                          extra={'spider': spider})
         return failure
 
-    def _parse_robots(self, response, netloc):
+    def _parse_robots(self, response, netloc, spider):
         self.crawler.stats.inc_value('robotstxt/response_count')
-        self.crawler.stats.inc_value(
-            'robotstxt/response_status_count/{}'.format(response.status))
-        rp = robotparser.RobotFileParser(response.url)
-        body = ''
-        if hasattr(response, 'text'):
-            body = response.text
-        else:  # last effort try
-            try:
-                body = response.body.decode('utf-8')
-            except UnicodeDecodeError:
-                # If we found garbage, disregard it:,
-                # but keep the lookup cached (in self._parsers)
-                # Running rp.parse() will set rp state from
-                # 'disallow all' to 'allow any'.
-                self.crawler.stats.inc_value('robotstxt/unicode_error_count')
-        # stdlib's robotparser expects native 'str' ;
-        # with unicode input, non-ASCII encoded bytes decoding fails in Python2
-        rp.parse(to_native_str(body).splitlines())
-
+        self.crawler.stats.inc_value('robotstxt/response_status_count/{}'.format(response.status))
+        rp = self._parserimpl.from_crawler(self.crawler, response.body)
         rp_dfd = self._parsers[netloc]
         self._parsers[netloc] = rp
         rp_dfd.callback(rp)
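
Because the middleware now resolves ROBOTSTXT_PARSER with load_object() and only ever calls from_crawler() and allowed() on the result, third-party backends can be plugged in as well. A hypothetical sketch (the LoggingRobotParser name and the myproject module path are illustrative, not part of this change) that delegates to the stdlib-based backend and logs each decision:

# myproject/robotstxt.py -- hypothetical location
import logging

from scrapy.robotstxt import RobotParser, PythonRobotParser

logger = logging.getLogger(__name__)


class LoggingRobotParser(RobotParser):
    """Illustrative backend that delegates to PythonRobotParser and logs every decision."""

    def __init__(self, robotstxt_body, spider):
        self.spider = spider
        self._delegate = PythonRobotParser(robotstxt_body, spider)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        return cls(robotstxt_body, spider)

    def allowed(self, url, user_agent):
        result = self._delegate.allowed(url, user_agent)
        logger.debug("robots.txt decision for %(url)s (%(ua)s): %(result)s",
                     {'url': url, 'ua': user_agent, 'result': result},
                     extra={'spider': self.spider})
        return result

Pointing ROBOTSTXT_PARSER at 'myproject.robotstxt.LoggingRobotParser' would then make the middleware's __init__ above call from_crawler(self.crawler, b'') once as a dependency check and allowed() for every request.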

@@ -245,6 +245,7 @@
 RETRY_PRIORITY_ADJUST = -1
 
 ROBOTSTXT_OBEY = False
+ROBOTSTXT_PARSER = 'scrapy.robotstxt.PythonRobotParser'
 
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
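
Given this default, a project can switch backends from its own settings.py; the middleware's dependency check in __init__ raises early if the chosen backend's library (reppy or robotexclusionrulesparser) is not installed. A sketch:

# settings.py of a project, assuming reppy is installed
ROBOTSTXT_OBEY = True
ROBOTSTXT_PARSER = 'scrapy.robotstxt.ReppyRobotParser'

# or, with robotexclusionrulesparser installed:
# ROBOTSTXT_PARSER = 'scrapy.robotstxt.RerpRobotParser'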