Showing 94 of 217 files from the diff.
Other files ignored by Codecov
.gitignore has changed.
docs/faq.rst has changed.
pytest.ini has changed.
.travis.yml has changed.
docs/conf.py has changed.
setup.cfg has changed.
.bumpversion.cfg has changed.
README.rst has changed.
scrapy/VERSION has changed.
setup.py has changed.
conftest.py has changed.
tox.ini has changed.
docs/news.rst has changed.
appveyor.yml was deleted.
tests/spiders.py has changed.
pylintrc has changed.
docs/index.rst has changed.

@@ -91,7 +91,7 @@
         return text
     if not isinstance(text, (bytes, str)):
         raise TypeError('to_unicode must receive a bytes or str '
-                        'object, got %s' % type(text).__name__)
+                        f'object, got {type(text).__name__}')
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
@@ -104,7 +104,7 @@
         return text
     if not isinstance(text, str):
         raise TypeError('to_bytes must receive a str or bytes '
-                        'object, got %s' % type(text).__name__)
+                        f'object, got {type(text).__name__}')
     if encoding is None:
         encoding = 'utf-8'
     return text.encode(encoding, errors)
@@ -174,7 +174,7 @@
     does not contain unprintable control characters.
     """
     if not isinstance(data, bytes):
-        raise TypeError("data must be bytes, got '%s'" % type(data).__name__)
+        raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
     return all(c not in _BINARYCHARS for c in data)


@@ -198,7 +198,8 @@
 def get_func_args(func, stripself=False):
     """Return the argument name list of a callable"""
     if inspect.isfunction(func):
-        func_args, _, _, _ = _getargspec_py23(func)
+        spec = inspect.getfullargspec(func)
+        func_args = spec.args + spec.kwonlyargs
     elif inspect.isclass(func):
         return get_func_args(func.__init__, True)
     elif inspect.ismethod(func):
@@ -216,7 +217,7 @@
         else:
             return get_func_args(func.__call__, True)
     else:
-        raise TypeError('%s is not callable' % type(func))
+        raise TypeError(f'{type(func)} is not callable')
     if stripself:
         func_args.pop(0)
     return func_args
@@ -249,7 +250,7 @@
     elif hasattr(func, '__call__'):
         spec = _getargspec_py23(func.__call__)
     else:
-        raise TypeError('%s is not callable' % type(func))
+        raise TypeError(f'{type(func)} is not callable')

     defaults = spec.defaults or []
@@ -321,7 +322,7 @@
     >>> global_object_name(Request)
     'scrapy.http.request.Request'
     """
-    return "%s.%s" % (obj.__module__, obj.__name__)
+    return f"{obj.__module__}.{obj.__name__}"


 if hasattr(sys, "pypy_version_info"):
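
Note (not part of the diff): a minimal sketch of why the switch to inspect.getfullargspec() in get_func_args() matters — keyword-only arguments are now included, which the old getargspec-based helper dropped. The function and its cb_kwarg parameter below are made up for illustration.

    import inspect

    def parse(self, response, *, cb_kwarg=None):
        pass

    spec = inspect.getfullargspec(parse)
    print(spec.args)                     # ['self', 'response']
    print(spec.kwonlyargs)               # ['cb_kwarg']
    print(spec.args + spec.kwonlyargs)   # what get_func_args() now reports for plain functions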

@@ -43,7 +43,7 @@
                     return False
                 else:
                     if self.verbose_stats:
-                        self.stats.inc_value('request_depth_count/%s' % depth,
+                        self.stats.inc_value(f'request_depth_count/{depth}',
                                              spider=spider)
                     self.stats.max_value('request_depth_max', depth,
                                          spider=spider)

@@ -1,4 +1,5 @@
 from email.utils import formatdate
+from typing import Optional, Type, TypeVar

 from twisted.internet import defer
 from twisted.internet.error import (
@@ -13,10 +14,19 @@
 from twisted.web.client import ResponseFailed

 from scrapy import signals
+from scrapy.crawler import Crawler
 from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy.http.request import Request
+from scrapy.http.response import Response
+from scrapy.settings import Settings
+from scrapy.spiders import Spider
+from scrapy.statscollectors import StatsCollector
 from scrapy.utils.misc import load_object


+HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")
+
+
 class HttpCacheMiddleware:

     DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
@@ -24,7 +34,7 @@
                            ConnectionLost, TCPTimedOutError, ResponseFailed,
                            IOError)

-    def __init__(self, settings, stats):
+    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
         if not settings.getbool('HTTPCACHE_ENABLED'):
             raise NotConfigured
         self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
@@ -33,26 +43,26 @@
         self.stats = stats

     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
         o = cls(crawler.settings, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o

-    def spider_opened(self, spider):
+    def spider_opened(self, spider: Spider) -> None:
         self.storage.open_spider(spider)

-    def spider_closed(self, spider):
+    def spider_closed(self, spider: Spider) -> None:
         self.storage.close_spider(spider)

-    def process_request(self, request, spider):
+    def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
         if request.meta.get('dont_cache', False):
-            return
+            return None

         # Skip uncacheable requests
         if not self.policy.should_cache_request(request):
             request.meta['_dont_cache'] = True  # flag as uncacheable
-            return
+            return None

         # Look for cached response and check if expired
         cachedresponse = self.storage.retrieve_response(spider, request)
@@ -61,7 +71,7 @@
             if self.ignore_missing:
                 self.stats.inc_value('httpcache/ignore', spider=spider)
                 raise IgnoreRequest("Ignored request not in cache: %s" % request)
-            return  # first time request
+            return None  # first time request

         # Return cached response only if not expired
         cachedresponse.flags.append('cached')
@@ -73,7 +83,9 @@
         # process_response hook
         request.meta['cached_response'] = cachedresponse

-    def process_response(self, request, response, spider):
+        return None
+
+    def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
         if request.meta.get('dont_cache', False):
             return response

@@ -85,7 +97,7 @@
         # RFC2616 requires origin server to set Date header,
         # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
         if 'Date' not in response.headers:
-            response.headers['Date'] = formatdate(usegmt=1)
+            response.headers['Date'] = formatdate(usegmt=True)

         # Do not validate first-hand responses
         cachedresponse = request.meta.pop('cached_response', None)
@@ -102,13 +114,18 @@
         self._cache_response(spider, response, request, cachedresponse)
         return response

-    def process_exception(self, request, exception, spider):
+    def process_exception(
+        self, request: Request, exception: Exception, spider: Spider
+    ) -> Optional[Response]:
         cachedresponse = request.meta.pop('cached_response', None)
         if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
             self.stats.inc_value('httpcache/errorrecovery', spider=spider)
             return cachedresponse
+        return None

-    def _cache_response(self, spider, response, request, cachedresponse):
+    def _cache_response(
+        self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
+    ) -> None:
         if self.policy.should_cache_response(response, request):
             self.stats.inc_value('httpcache/store', spider=spider)
             self.storage.store_response(spider, request, response)
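
Note (not part of the diff): a hedged sketch of what the TypeVar bound to "HttpCacheMiddleware" buys — from_crawler() is now annotated to return the class it is called on, so type checkers infer subclass instances correctly. The subclass and the build() helper below are hypothetical.

    from scrapy.crawler import Crawler
    from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware

    class CustomCacheMiddleware(HttpCacheMiddleware):
        pass

    def build(crawler: Crawler) -> CustomCacheMiddleware:
        # Accepted by mypy/pyright: cls is typed as Type[HttpCacheMiddlewareTV],
        # so the inferred return type is CustomCacheMiddleware, not the base class.
        return CustomCacheMiddleware.from_crawler(crawler)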

@@ -66,31 +66,25 @@
             print("Cannot create a spider with the same name as your project")
             return

-        try:
-            spidercls = self.crawler_process.spider_loader.load(name)
-        except KeyError:
-            pass
-        else:
-            # if spider already exists and not --force then halt
-            if not opts.force:
-                print("Spider %r already exists in module:" % name)
-                print("  %s" % spidercls.__module__)
-                return
+        if not opts.force and self._spider_exists(name):
+            return
+
         template_file = self._find_template(opts.template)
         if template_file:
             self._genspider(module, name, domain, opts.template, template_file)
             if opts.edit:
-                self.exitcode = os.system('scrapy edit "%s"' % name)
+                self.exitcode = os.system(f'scrapy edit "{name}"')

     def _genspider(self, module, name, domain, template_name, template_file):
         """Generate the spider module, based on the given template"""
+        capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
         tvars = {
             'project_name': self.settings.get('BOT_NAME'),
             'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
             'module': module,
             'name': name,
             'domain': domain,
-            'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
+            'classname': f'{capitalized_module}Spider'
         }
         if self.settings.get('NEWSPIDER_MODULE'):
             spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
@@ -98,26 +92,54 @@
         else:
             spiders_module = None
             spiders_dir = "."
-        spider_file = "%s.py" % join(spiders_dir, module)
+        spider_file = f"{join(spiders_dir, module)}.py"
         shutil.copyfile(template_file, spider_file)
         render_templatefile(spider_file, **tvars)
-        print("Created spider %r using template %r "
-              % (name, template_name), end=('' if spiders_module else '\n'))
+        print(f"Created spider {name!r} using template {template_name!r} ",
+              end=('' if spiders_module else '\n'))
         if spiders_module:
-            print("in module:\n  %s.%s" % (spiders_module.__name__, module))
+            print(f"in module:\n  {spiders_module.__name__}.{module}")

     def _find_template(self, template):
-        template_file = join(self.templates_dir, '%s.tmpl' % template)
+        template_file = join(self.templates_dir, f'{template}.tmpl')
         if exists(template_file):
             return template_file
-        print("Unable to find template: %s\n" % template)
+        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')

     def _list_templates(self):
         print("Available templates:")
         for filename in sorted(os.listdir(self.templates_dir)):
             if filename.endswith('.tmpl'):
-                print("  %s" % splitext(filename)[0])
+                print(f"  {splitext(filename)[0]}")
+
+    def _spider_exists(self, name):
+        if not self.settings.get('NEWSPIDER_MODULE'):
+            # if run as a standalone command and file with same filename already exists
+            if exists(name + ".py"):
+                print(f"{abspath(name + '.py')} already exists")
+                return True
+            return False
+
+        try:
+            spidercls = self.crawler_process.spider_loader.load(name)
+        except KeyError:
+            pass
+        else:
+            # if spider with same name exists
+            print(f"Spider {name!r} already exists in module:")
+            print(f"  {spidercls.__module__}")
+            return True
+
+        # a file with the same name exists in the target directory
+        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
+        spiders_dir = dirname(spiders_module.__file__)
+        spiders_dir_abs = abspath(spiders_dir)
+        if exists(join(spiders_dir_abs, name + ".py")):
+            print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
+            return True
+
+        return False

     @property
     def templates_dir(self):

@@ -60,8 +60,7 @@

         if len(self.args) not in [1, 2, 3]:
             raise ValueError(
-                "Incorrect argument quantity: expected 1, 2 or 3, got %i"
-                % len(self.args)
+                f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
             )
         self.obj_name = self.args[0] or None
         self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
@@ -88,10 +87,9 @@
             if self.min_bound == self.max_bound:
                 expected = self.min_bound
             else:
-                expected = '%s..%s' % (self.min_bound, self.max_bound)
+                expected = f'{self.min_bound}..{self.max_bound}'

-            raise ContractFail("Returned %s %s, expected %s" %
-                               (occurrences, self.obj_name, expected))
+            raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")


 class ScrapesContract(Contract):
@@ -106,5 +104,5 @@
             if is_item(x):
                 missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
                 if missing:
-                    missing_str = ", ".join(missing)
-                    raise ContractFail("Missing fields: %s" % missing_str)
+                    missing_fields = ", ".join(missing)
+                    raise ContractFail(f"Missing fields: {missing_fields}")

@@ -61,15 +61,15 @@
                 continue
             elif url_pattern.match(domain):
                 message = ("allowed_domains accepts only domains, not URLs. "
-                           "Ignoring URL entry %s in allowed_domains." % domain)
+                           f"Ignoring URL entry {domain} in allowed_domains.")
                 warnings.warn(message, URLWarning)
             elif port_pattern.search(domain):
                 message = ("allowed_domains accepts only domains without ports. "
-                           "Ignoring entry %s in allowed_domains." % domain)
+                           f"Ignoring entry {domain} in allowed_domains.")
                 warnings.warn(message, PortWarning)
             else:
                 domains.append(re.escape(domain))
-        regex = r'^(.*\.)?(%s)$' % '|'.join(domains)
+        regex = fr'^(.*\.)?({"|".join(domains)})$'
         return re.compile(regex)

     def spider_opened(self, spider):

@@ -19,6 +19,8 @@

 AJAXCRAWL_ENABLED = False

+ASYNCIO_EVENT_LOOP = None
+
 AUTOTHROTTLE_ENABLED = False
 AUTOTHROTTLE_DEBUG = False
 AUTOTHROTTLE_MAX_DELAY = 60.0
@@ -285,7 +287,7 @@

 URLLENGTH_LIMIT = 2083

-USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__
+USER_AGENT = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'

 TELNETCONSOLE_ENABLED = 1
 TELNETCONSOLE_PORT = [6023, 6073]
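
Note (not part of the diff): a configuration sketch, based on my understanding, of how the new ASYNCIO_EVENT_LOOP setting is meant to be used — it takes the import path of an event loop class and only applies when the asyncio reactor is enabled; 'uvloop.Loop' assumes uvloop is installed.

    # settings.py (sketch)
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
    ASYNCIO_EVENT_LOOP = 'uvloop.Loop'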

@@ -10,17 +10,7 @@
 from importlib import import_module
 from twisted.trial.unittest import SkipTest

-from scrapy.exceptions import NotConfigured
-from scrapy.utils.boto import is_botocore
-
-
-def assert_aws_environ():
-    """Asserts the current environment is suitable for running AWS testsi.
-    Raises SkipTest with the reason if it's not.
-    """
-    skip_if_no_boto()
-    if 'AWS_ACCESS_KEY_ID' not in os.environ:
-        raise SkipTest("AWS keys not found")
+from scrapy.utils.boto import is_botocore_available


 def assert_gcs_environ():
@@ -29,30 +19,8 @@


 def skip_if_no_boto():
-    try:
-        is_botocore()
-    except NotConfigured as e:
-        raise SkipTest(e)
-
-
-def get_s3_content_and_delete(bucket, path, with_key=False):
-    """ Get content from s3 key, and delete key afterwards.
-    """
-    if is_botocore():
-        import botocore.session
-        session = botocore.session.get_session()
-        client = session.create_client('s3')
-        key = client.get_object(Bucket=bucket, Key=path)
-        content = key['Body'].read()
-        client.delete_object(Bucket=bucket, Key=path)
-    else:
-        import boto
-        # assuming boto=2.2.2
-        bucket = boto.connect_s3().get_bucket(bucket, validate=False)
-        key = bucket.get_key(path)
-        content = key.get_contents_as_string()
-        bucket.delete_key(path)
-    return (content, key) if with_key else content
+    if not is_botocore_available():
+        raise SkipTest('missing botocore library')


 def get_gcs_content_and_delete(bucket, path):
@@ -79,7 +47,7 @@

     def buffer_data(data):
         ftp_data.append(data)
-    ftp.retrbinary('RETR %s' % path, buffer_data)
+    ftp.retrbinary(f'RETR {path}', buffer_data)
     dirname, filename = split(path)
     ftp.cwd(dirname)
     ftp.delete(filename)

@@ -80,15 +80,15 @@
                             base_url=get_base_url(response))
     forms = root.xpath('//form')
     if not forms:
-        raise ValueError("No <form> element found in %s" % response)
+        raise ValueError(f"No <form> element found in {response}")

     if formname is not None:
-        f = root.xpath('//form[@name="%s"]' % formname)
+        f = root.xpath(f'//form[@name="{formname}"]')
         if f:
             return f[0]

     if formid is not None:
-        f = root.xpath('//form[@id="%s"]' % formid)
+        f = root.xpath(f'//form[@id="{formid}"]')
         if f:
             return f[0]

@@ -103,7 +103,7 @@
                 el = el.getparent()
                 if el is None:
                     break
-        raise ValueError('No <form> element found with %s' % formxpath)
+        raise ValueError(f'No <form> element found with {formxpath}')

     # If we get here, it means that either formname was None
     # or invalid
@@ -111,8 +111,7 @@
         try:
             form = forms[formnumber]
         except IndexError:
-            raise IndexError("Form number %d not found in %s" %
-                             (formnumber, response))
+            raise IndexError(f"Form number {formnumber} not found in {response}")
         else:
             return form

@@ -205,12 +204,12 @@

     # We didn't find it, so now we build an XPath expression out of the other
     # arguments, because they can be used as such
-    xpath = './/*' + ''.join('[@%s="%s"]' % c for c in clickdata.items())
+    xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
     el = form.xpath(xpath)
     if len(el) == 1:
         return (el[0].get('name'), el[0].get('value') or '')
     elif len(el) > 1:
-        raise ValueError("Multiple elements found (%r) matching the criteria "
-                         "in clickdata: %r" % (el, clickdata))
+        raise ValueError(f"Multiple elements found ({el!r}) matching the "
+                         f"criteria in clickdata: {clickdata!r}")
     else:
-        raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
+        raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')

@@ -124,18 +124,11 @@
             errback(failure.Failure(), *a, **kw)


-def _isfuture(o):
-    # workaround for Python before 3.5.3 not having asyncio.isfuture
-    if hasattr(asyncio, 'isfuture'):
-        return asyncio.isfuture(o)
-    return isinstance(o, asyncio.Future)
-
-
 def deferred_from_coro(o):
     """Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
     if isinstance(o, defer.Deferred):
         return o
-    if _isfuture(o) or inspect.isawaitable(o):
+    if asyncio.isfuture(o) or inspect.isawaitable(o):
         if not is_asyncio_reactor_installed():
             # wrapping the coroutine directly into a Deferred, this doesn't work correctly with coroutines
             # that use asyncio, e.g. "await asyncio.sleep(1)"
@@ -167,7 +160,7 @@

     if isinstance(result, defer.Deferred):
         return result
-    elif _isfuture(result) or inspect.isawaitable(result):
+    elif asyncio.isfuture(result) or inspect.isawaitable(result):
         return deferred_from_coro(result)
     elif isinstance(result, failure.Failure):
         return defer.fail(result)

@@ -140,7 +140,7 @@
         b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
         for k, v in sorted(self.vars.items()):
             if self._is_relevant(v):
-                b.append("  %-10s %s" % (k, v))
+                b.append(f"  {k:<10} {v}")
         b.append("Useful shortcuts:")
         if self.inthread:
             b.append("  fetch(url[, redirect=True]) "
@@ -150,7 +150,7 @@
         b.append("  shelp()           Shell help (print this help)")
         b.append("  view(response)    View response in a browser")

-        return "\n".join("[s] %s" % line for line in b)
+        return "\n".join(f"[s] {line}" for line in b)

     def _is_relevant(self, value):
         return isinstance(value, self.relevant_classes) or is_item(value)

@@ -30,4 +30,4 @@
         for cls, wdict in live_refs.items():
             if not wdict:
                 continue
-            self.stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict), spider=spider)
+            self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)

@@ -11,7 +11,6 @@
 import time
 from collections import defaultdict
 from contextlib import suppress
-from email.utils import mktime_tz, parsedate_tz
 from ftplib import FTP
 from io import BytesIO
 from urllib.parse import urlparse
@@ -23,7 +22,7 @@
 from scrapy.http import Request
 from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 from scrapy.utils.datatypes import CaselessDict
 from scrapy.utils.ftp import ftp_store_file
 from scrapy.utils.log import failure_to_exc_info
@@ -91,86 +90,54 @@
     }

     def __init__(self, uri):
-        self.is_botocore = is_botocore()
-        if self.is_botocore:
-            import botocore.session
-            session = botocore.session.get_session()
-            self.s3_client = session.create_client(
-                's3',
-                aws_access_key_id=self.AWS_ACCESS_KEY_ID,
-                aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
-                endpoint_url=self.AWS_ENDPOINT_URL,
-                region_name=self.AWS_REGION_NAME,
-                use_ssl=self.AWS_USE_SSL,
-                verify=self.AWS_VERIFY
-            )
-        else:
-            from boto.s3.connection import S3Connection
-            self.S3Connection = S3Connection
+        if not is_botocore_available():
+            raise NotConfigured('missing botocore library')
+        import botocore.session
+        session = botocore.session.get_session()
+        self.s3_client = session.create_client(
+            's3',
+            aws_access_key_id=self.AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
+            endpoint_url=self.AWS_ENDPOINT_URL,
+            region_name=self.AWS_REGION_NAME,
+            use_ssl=self.AWS_USE_SSL,
+            verify=self.AWS_VERIFY
+        )
         if not uri.startswith("s3://"):
-            raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
+            raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
         self.bucket, self.prefix = uri[5:].split('/', 1)

     def stat_file(self, path, info):
         def _onsuccess(boto_key):
-            if self.is_botocore:
-                checksum = boto_key['ETag'].strip('"')
-                last_modified = boto_key['LastModified']
-                modified_stamp = time.mktime(last_modified.timetuple())
-            else:
-                checksum = boto_key.etag.strip('"')
-                last_modified = boto_key.last_modified
-                modified_tuple = parsedate_tz(last_modified)
-                modified_stamp = int(mktime_tz(modified_tuple))
+            checksum = boto_key['ETag'].strip('"')
+            last_modified = boto_key['LastModified']
+            modified_stamp = time.mktime(last_modified.timetuple())
             return {'checksum': checksum, 'last_modified': modified_stamp}

         return self._get_boto_key(path).addCallback(_onsuccess)

-    def _get_boto_bucket(self):
-        # disable ssl (is_secure=False) because of this python bug:
-        # https://bugs.python.org/issue5103
-        c = self.S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, is_secure=False)
-        return c.get_bucket(self.bucket, validate=False)
-
     def _get_boto_key(self, path):
-        key_name = '%s%s' % (self.prefix, path)
-        if self.is_botocore:
-            return threads.deferToThread(
-                self.s3_client.head_object,
-                Bucket=self.bucket,
-                Key=key_name)
-        else:
-            b = self._get_boto_bucket()
-            return threads.deferToThread(b.get_key, key_name)
+        key_name = f'{self.prefix}{path}'
+        return threads.deferToThread(
+            self.s3_client.head_object,
+            Bucket=self.bucket,
+            Key=key_name)

     def persist_file(self, path, buf, info, meta=None, headers=None):
         """Upload file to S3 storage"""
-        key_name = '%s%s' % (self.prefix, path)
+        key_name = f'{self.prefix}{path}'
         buf.seek(0)
-        if self.is_botocore:
-            extra = self._headers_to_botocore_kwargs(self.HEADERS)
-            if headers:
-                extra.update(self._headers_to_botocore_kwargs(headers))
-            return threads.deferToThread(
-                self.s3_client.put_object,
-                Bucket=self.bucket,
-                Key=key_name,
-                Body=buf,
-                Metadata={k: str(v) for k, v in (meta or {}).items()},
-                ACL=self.POLICY,
-                **extra)
-        else:
-            b = self._get_boto_bucket()
-            k = b.new_key(key_name)
-            if meta:
-                for metakey, metavalue in meta.items():
-                    k.set_metadata(metakey, str(metavalue))
-            h = self.HEADERS.copy()
-            if headers:
-                h.update(headers)
-            return threads.deferToThread(
-                k.set_contents_from_string, buf.getvalue(),
-                headers=h, policy=self.POLICY)
+        extra = self._headers_to_botocore_kwargs(self.HEADERS)
+        if headers:
+            extra.update(self._headers_to_botocore_kwargs(headers))
+        return threads.deferToThread(
+            self.s3_client.put_object,
+            Bucket=self.bucket,
+            Key=key_name,
+            Body=buf,
+            Metadata={k: str(v) for k, v in (meta or {}).items()},
+            ACL=self.POLICY,
+            **extra)

     def _headers_to_botocore_kwargs(self, headers):
         """ Convert headers to botocore keyword agruments.
@@ -208,8 +175,7 @@
             try:
                 kwarg = mapping[key]
             except KeyError:
-                raise TypeError(
-                    'Header "%s" is not supported by botocore' % key)
+                raise TypeError(f'Header "{key}" is not supported by botocore')
             else:
                 extra[kwarg] = value
         return extra
@@ -283,7 +249,7 @@

     def __init__(self, uri):
         if not uri.startswith("ftp://"):
-            raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
+            raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
         u = urlparse(uri)
         self.port = u.port
         self.host = u.hostname
@@ -293,7 +259,7 @@
         self.basedir = u.path.rstrip('/')

     def persist_file(self, path, buf, info, meta=None, headers=None):
-        path = '%s/%s' % (self.basedir, path)
+        path = f'{self.basedir}/{path}'
         return threads.deferToThread(
             ftp_store_file, path=path, file=buf,
             host=self.host, port=self.port, username=self.username,
@@ -308,10 +274,10 @@
                 ftp.login(self.username, self.password)
                 if self.USE_ACTIVE_MODE:
                     ftp.set_pasv(False)
-                file_path = "%s/%s" % (self.basedir, path)
-                last_modified = float(ftp.voidcmd("MDTM %s" % file_path)[4:].strip())
+                file_path = f"{self.basedir}/{path}"
+                last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
                 m = hashlib.md5()
-                ftp.retrbinary('RETR %s' % file_path, m.update)
+                ftp.retrbinary(f'RETR {file_path}', m.update)
                 return {'last_modified': last_modified, 'checksum': m.hexdigest()}
             # The file doesn't exist
             except Exception:
@@ -409,7 +375,7 @@
         store_cls = self.STORE_SCHEMES[scheme]
         return store_cls(uri)

-    def media_to_download(self, request, info):
+    def media_to_download(self, request, info, *, item=None):
        def _onsuccess(result):
            if not result:
                return  # returning None force download
@@ -436,7 +402,7 @@
             checksum = result.get('checksum', None)
             return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}

-        path = self.file_path(request, info=info)
+        path = self.file_path(request, info=info, item=item)
         dfd = defer.maybeDeferred(self.store.stat_file, path, info)
         dfd.addCallbacks(_onsuccess, lambda _: None)
         dfd.addErrback(
@@ -460,7 +426,7 @@

         raise FileException

-    def media_downloaded(self, response, request, info):
+    def media_downloaded(self, response, request, info, *, item=None):
         referer = referer_str(request)

         if response.status != 200:
@@ -492,8 +458,8 @@
         self.inc_stats(info.spider, status)

         try:
-            path = self.file_path(request, response=response, info=info)
-            checksum = self.file_downloaded(response, request, info)
+            path = self.file_path(request, response=response, info=info, item=item)
+            checksum = self.file_downloaded(response, request, info, item=item)
         except FileException as exc:
             logger.warning(
                 'File (error): Error processing file from %(request)s '
@@ -515,15 +481,15 @@

     def inc_stats(self, spider, status):
         spider.crawler.stats.inc_value('file_count', spider=spider)
-        spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)
+        spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)

     # Overridable Interface
     def get_media_requests(self, item, info):
         urls = ItemAdapter(item).get(self.files_urls_field, [])
         return [Request(u) for u in urls]

-    def file_downloaded(self, response, request, info):
-        path = self.file_path(request, response=response, info=info)
+    def file_downloaded(self, response, request, info, *, item=None):
+        path = self.file_path(request, response=response, info=info, item=item)
         buf = BytesIO(response.body)
         checksum = md5sum(buf)
         buf.seek(0)
@@ -535,7 +501,7 @@
             ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
         return item

-    def file_path(self, request, response=None, info=None):
+    def file_path(self, request, response=None, info=None, *, item=None):
         media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
         media_ext = os.path.splitext(request.url)[1]
         # Handles empty and wild extensions by trying to guess the
@@ -545,4 +511,4 @@
             media_type = mimetypes.guess_type(request.url)[0]
             if media_type:
                 media_ext = mimetypes.guess_extension(media_type)
-        return 'full/%s%s' % (media_guid, media_ext)
+        return f'full/{media_guid}{media_ext}'
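
Note (not part of the diff): a sketch of what the new keyword-only item argument enables in FilesPipeline overrides — file paths can now depend on the item that produced the request. The subclass and the product_id field are hypothetical.

    from itemadapter import ItemAdapter
    from scrapy.pipelines.files import FilesPipeline

    class PerProductFilesPipeline(FilesPipeline):
        def file_path(self, request, response=None, info=None, *, item=None):
            default_path = super().file_path(request, response=response, info=info, item=item)
            if item is None:
                return default_path
            # Prefix the default path with a value taken from the item.
            product_id = ItemAdapter(item).get('product_id', 'unknown')
            return f'{product_id}/{default_path}'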

@@ -48,7 +48,7 @@
         if isinstance(exception, HttpError):
             spider.crawler.stats.inc_value('httperror/response_ignored_count')
             spider.crawler.stats.inc_value(
-                'httperror/response_ignored_status_count/%s' % response.status
+                f'httperror/response_ignored_status_count/{response.status}'
             )
             logger.info(
                 "Ignoring response %(response)r: HTTP status code is not handled or not allowed",

@@ -20,7 +20,7 @@
         try:
             import_module(scrapy_module)
         except ImportError as exc:
-            warnings.warn("Cannot import scrapy settings module %s: %s" % (scrapy_module, exc))
+            warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
         else:
             return True
     return bool(closest_scrapy_cfg())
@@ -90,7 +90,7 @@
         warnings.warn(
             'Use of environment variables prefixed with SCRAPY_ to override '
             'settings is deprecated. The following environment variables are '
-            'currently defined: {}'.format(setting_envvar_list),
+            f'currently defined: {setting_envvar_list}',
             ScrapyDeprecationWarning
         )
     settings.setdict(scrapy_envvars, priority='project')

@@ -141,17 +141,16 @@

     def __init__(self, crawler, downstream_queue_cls, key, slot_startprios=()):
         if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
-            raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
-                             % (self.__class__,))
+            raise ValueError(f'"{self.__class__}" does not support CONCURRENT_REQUESTS_PER_IP')

         if slot_startprios and not isinstance(slot_startprios, dict):
             raise ValueError("DownloaderAwarePriorityQueue accepts "
-                             "``slot_startprios`` as a dict; %r instance "
+                             "``slot_startprios`` as a dict; "
+                             f"{slot_startprios.__class__!r} instance "
                              "is passed. Most likely, it means the state is"
                              "created by an incompatible priority queue. "
                              "Only a crawl started with the same priority "
-                             "queue class can be resumed." %
-                             slot_startprios.__class__)
+                             "queue class can be resumed.")

         self._downloader_interface = DownloaderInterface(crawler)
         self.downstream_queue_cls = downstream_queue_cls

@@ -1,3 +1,4 @@
+import io
 import zlib

 from scrapy.utils.gz import gunzip
@@ -14,6 +15,12 @@
 except ImportError:
     pass

+try:
+    import zstandard
+    ACCEPTED_ENCODINGS.append(b'zstd')
+except ImportError:
+    pass
+

 class HttpCompressionMiddleware:
     """This middleware allows compressed (gzip, deflate) traffic to be
@@ -67,4 +74,9 @@
                 body = zlib.decompress(body, -15)
         if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
             body = brotli.decompress(body)
+        if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
+            # Using its streaming API since its simple API could handle only cases
+            # where there is content size data embedded in the frame
+            reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
+            body = reader.read()
         return body
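
Note (not part of the diff): a small standalone round trip showing the zstandard streaming API the new branch relies on; the one-shot compress() call is only used here to produce a frame for the demo, while stream_reader() also handles frames that do not embed the content size.

    import io
    import zstandard

    raw = b'x' * 1000
    frame = zstandard.ZstdCompressor().compress(raw)

    # Mirrors the middleware: decompress through the streaming API.
    reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(frame))
    assert reader.read() == raw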

@@ -21,8 +21,8 @@
         for nl in nlist:
             args['n'] = nl
             argstr = urlencode(args, doseq=True)
-            request.write("<a href='/follow?{0}'>follow {1}</a><br>"
-                          .format(argstr, nl).encode('utf8'))
+            request.write(f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
+                          .encode('utf8'))
         request.write(b"</body></html>")
         return b''

@@ -39,6 +39,6 @@

     def _print_listening():
         httpHost = httpPort.getHost()
-        print("Bench server at http://{}:{}".format(httpHost.host, httpHost.port))
+        print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
     reactor.callWhenRunning(_print_listening)
     reactor.run()

@@ -19,10 +19,7 @@


 def _fname(f):
-    return "{}.{}".format(
-        f.__self__.__class__.__name__,
-        f.__func__.__name__
-    )
+    return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"


 class SpiderMiddlewareManager(MiddlewareManager):
@@ -51,8 +48,9 @@
                 try:
                     result = method(response=response, spider=spider)
                     if result is not None:
-                        msg = "Middleware {} must return None or raise an exception, got {}"
-                        raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                        msg = (f"Middleware {_fname(method)} must return None "
+                               f"or raise an exception, got {type(result)}")
+                        raise _InvalidOutput(msg)
                 except _InvalidOutput:
                     raise
                 except Exception:
@@ -86,8 +84,9 @@
                 elif result is None:
                     continue
                 else:
-                    msg = "Middleware {} must return None or an iterable, got {}"
-                    raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                    msg = (f"Middleware {_fname(method)} must return None "
+                           f"or an iterable, got {type(result)}")
+                    raise _InvalidOutput(msg)
             return _failure

         def process_spider_output(result, start_index=0):
@@ -110,8 +109,9 @@
                 if _isiterable(result):
                     result = _evaluate_iterable(result, method_index + 1, recovered)
                 else:
-                    msg = "Middleware {} must return an iterable, got {}"
-                    raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                    msg = (f"Middleware {_fname(method)} must return an "
+                           f"iterable, got {type(result)}")
+                    raise _InvalidOutput(msg)

             return MutableChain(result, recovered)

@@ -103,12 +103,12 @@
         store_uri = settings['IMAGES_STORE']
         return cls(store_uri, settings=settings)

-    def file_downloaded(self, response, request, info):
-        return self.image_downloaded(response, request, info)
+    def file_downloaded(self, response, request, info, *, item=None):
+        return self.image_downloaded(response, request, info, item=item)

-    def image_downloaded(self, response, request, info):
+    def image_downloaded(self, response, request, info, *, item=None):
         checksum = None
-        for path, image, buf in self.get_images(response, request, info):
+        for path, image, buf in self.get_images(response, request, info, item=item):
             if checksum is None:
                 buf.seek(0)
                 checksum = md5sum(buf)
@@ -119,14 +119,15 @@
                 headers={'Content-Type': 'image/jpeg'})
         return checksum

-    def get_images(self, response, request, info):
-        path = self.file_path(request, response=response, info=info)
+    def get_images(self, response, request, info, *, item=None):
+        path = self.file_path(request, response=response, info=info, item=item)
         orig_image = Image.open(BytesIO(response.body))

         width, height = orig_image.size
         if width < self.min_width or height < self.min_height:
-            raise ImageException("Image too small (%dx%d < %dx%d)" %
-                                 (width, height, self.min_width, self.min_height))
+            raise ImageException("Image too small "
+                                 f"({width}x{height} < "
+                                 f"{self.min_width}x{self.min_height})")

         image, buf = self.convert_image(orig_image)
         yield path, image, buf
@@ -166,10 +167,10 @@
             ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
         return item

-    def file_path(self, request, response=None, info=None):
+    def file_path(self, request, response=None, info=None, *, item=None):
         image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
-        return 'full/%s.jpg' % (image_guid)
+        return f'full/{image_guid}.jpg'

     def thumb_path(self, request, thumb_id, response=None, info=None):
         thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
-        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
+        return f'thumbs/{thumb_id}/{thumb_guid}.jpg'

@@ -71,25 +71,20 @@


 def _find_method(obj, func):
-    if obj:
-        try:
-            func_self = func.__self__
-        except AttributeError:  # func has no __self__
-            pass
-        else:
-            if func_self is obj:
-                members = inspect.getmembers(obj, predicate=inspect.ismethod)
-                for name, obj_func in members:
-                    # We need to use __func__ to access the original
-                    # function object because instance method objects
-                    # are generated each time attribute is retrieved from
-                    # instance.
-                    #
-                    # Reference: The standard type hierarchy
-                    # https://docs.python.org/3/reference/datamodel.html
-                    if obj_func.__func__ is func.__func__:
-                        return name
-    raise ValueError("Function %s is not a method of: %s" % (func, obj))
+    # Only instance methods contain ``__func__``
+    if obj and hasattr(func, '__func__'):
+        members = inspect.getmembers(obj, predicate=inspect.ismethod)
+        for name, obj_func in members:
+            # We need to use __func__ to access the original
+            # function object because instance method objects
+            # are generated each time attribute is retrieved from
+            # instance.
+            #
+            # Reference: The standard type hierarchy
+            # https://docs.python.org/3/reference/datamodel.html
+            if obj_func.__func__ is func.__func__:
+                return name
+    raise ValueError(f"Function {func} is not an instance method in: {obj}")


 def _get_method(obj, name):
@@ -97,4 +92,4 @@
     try:
         return getattr(obj, name)
     except AttributeError:
-        raise ValueError("Method %r not found in: %s" % (name, obj))
+        raise ValueError(f"Method {name!r} not found in: {obj}")

@@ -2,41 +2,20 @@

 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.exceptions import NotConfigured
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.misc import create_instance


-def _get_boto_connection():
-    from boto.s3.connection import S3Connection
-
-    class _v19_S3Connection(S3Connection):
-        """A dummy S3Connection wrapper that doesn't do any synchronous download"""
-        def _mexe(self, method, bucket, key, headers, *args, **kwargs):
-            return headers
-
-    class _v20_S3Connection(S3Connection):
-        """A dummy S3Connection wrapper that doesn't do any synchronous download"""
-        def _mexe(self, http_request, *args, **kwargs):
-            http_request.authorize(connection=self)
-            return http_request.headers
-
-    try:
-        import boto.auth  # noqa: F401
-    except ImportError:
-        _S3Connection = _v19_S3Connection
-    else:
-        _S3Connection = _v20_S3Connection
-
-    return _S3Connection
-
-
 class S3DownloadHandler:

     def __init__(self, settings, *,
                  crawler=None,
                  aws_access_key_id=None, aws_secret_access_key=None,
                  httpdownloadhandler=HTTPDownloadHandler, **kw):
+        if not is_botocore_available():
+            raise NotConfigured('missing botocore library')
+
         if not aws_access_key_id:
             aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
         if not aws_secret_access_key:
@@ -51,23 +30,15 @@
         self.anon = kw.get('anon')

         self._signer = None
-        if is_botocore():
-            import botocore.auth
-            import botocore.credentials
-            kw.pop('anon', None)
-            if kw:
-                raise TypeError('Unexpected keyword arguments: %s' % kw)
-            if not self.anon:
-                SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
-                self._signer = SignerCls(botocore.credentials.Credentials(
-                    aws_access_key_id, aws_secret_access_key))
-        else:
-            _S3Connection = _get_boto_connection()
-            try:
-                self.conn = _S3Connection(
-                    aws_access_key_id, aws_secret_access_key, **kw)
-            except Exception as ex:
-                raise NotConfigured(str(ex))
+        import botocore.auth
+        import botocore.credentials
+        kw.pop('anon', None)
+        if kw:
+            raise TypeError(f'Unexpected keyword arguments: {kw}')
+        if not self.anon:
+            SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
+            self._signer = SignerCls(botocore.credentials.Credentials(
+                aws_access_key_id, aws_secret_access_key))

         _http_handler = create_instance(
             objcls=httpdownloadhandler,
@@ -85,14 +56,14 @@
         scheme = 'https' if request.meta.get('is_secure') else 'http'
         bucket = p.hostname
         path = p.path + '?' + p.query if p.query else p.path
-        url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
+        url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
         if self.anon:
             request = request.replace(url=url)
         elif self._signer is not None:
             import botocore.awsrequest
             awsrequest = botocore.awsrequest.AWSRequest(
                 method=request.method,
-                url='%s://s3.amazonaws.com/%s%s' % (scheme, bucket, path),
+                url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
                 headers=request.headers.to_unicode_dict(),
                 data=request.body)
             self._signer.add_auth(awsrequest)

@@ -39,7 +39,7 @@
     """Return status code plus status text descriptive message
     """
     message = http.RESPONSES.get(int(status), "Unknown Status")
-    return '%s %s' % (status, to_unicode(message))
+    return f'{status} {to_unicode(message)}'


 def response_httprepr(response):
@@ -69,15 +69,15 @@
     body = response.body
     if isinstance(response, HtmlResponse):
         if b'<base' not in body:
-            repl = '<head><base href="%s">' % response.url
+            repl = f'<head><base href="{response.url}">'
             body = body.replace(b'<head>', to_bytes(repl))
         ext = '.html'
     elif isinstance(response, TextResponse):
         ext = '.txt'
     else:
-        raise TypeError("Unsupported response type: %s" %
-                        response.__class__.__name__)
+        raise TypeError("Unsupported response type: "
+                        f"{response.__class__.__name__}")
     fd, fname = tempfile.mkstemp(ext)
     os.write(fd, body)
     os.close(fd)
-    return _openfunc("file://%s" % fname)
+    return _openfunc(f"file://{fname}")

@@ -54,8 +54,8 @@
Loading
54 54
            cl = [to_unicode(c, errors='replace')
55 55
                  for c in request.headers.getlist('Cookie')]
56 56
            if cl:
57 -
                cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
58 -
                msg = "Sending cookies to: {}\n{}".format(request, cookies)
57 +
                cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
58 +
                msg = f"Sending cookies to: {request}\n{cookies}"
59 59
                logger.debug(msg, extra={'spider': spider})
60 60
61 61
    def _debug_set_cookie(self, response, spider):
@@ -63,8 +63,8 @@
Loading
63 63
            cl = [to_unicode(c, errors='replace')
64 64
                  for c in response.headers.getlist('Set-Cookie')]
65 65
            if cl:
66 -
                cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
67 -
                msg = "Received cookies from: {}\n{}".format(response, cookies)
66 +
                cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
67 +
                msg = f"Received cookies from: {response}\n{cookies}"
68 68
                logger.debug(msg, extra={'spider': spider})
69 69
70 70
    def _format_cookie(self, cookie, request):
@@ -74,7 +74,7 @@
Loading
74 74
        """
75 75
        decoded = {}
76 76
        for key in ("name", "value", "path", "domain"):
77 -
            if not cookie.get(key):
77 +
            if cookie.get(key) is None:
78 78
                if key in ("name", "value"):
79 79
                    msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
80 80
                    logger.warning(msg.format(request, cookie, key))
@@ -90,42 +90,21 @@
Loading
90 90
                                   request, cookie)
91 91
                    decoded[key] = cookie[key].decode("latin1", errors="replace")
92 92
93 -
        cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
93 +
        cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
94 94
        for key, value in decoded.items():  # path, domain
95 -
            cookie_str += "; {}={}".format(key.capitalize(), value)
95 +
            cookie_str += f"; {key.capitalize()}={value}"
96 96
        return cookie_str
97 97
98 98
    def _get_request_cookies(self, jar, request):
99 99
        """
100 -
        Extract cookies from a Request. Values from the `Request.cookies` attribute
101 -
        take precedence over values from the `Cookie` request header.
100 +
        Extract cookies from the Request.cookies attribute
102 101
        """
103 -
        def get_cookies_from_header(jar, request):
104 -
            cookie_header = request.headers.get("Cookie")
105 -
            if not cookie_header:
106 -
                return []
107 -
            cookie_gen_bytes = (s.strip() for s in cookie_header.split(b";"))
108 -
            cookie_list_unicode = []
109 -
            for cookie_bytes in cookie_gen_bytes:
110 -
                try:
111 -
                    cookie_unicode = cookie_bytes.decode("utf8")
112 -
                except UnicodeDecodeError:
113 -
                    logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
114 -
                                   request, cookie_bytes)
115 -
                    cookie_unicode = cookie_bytes.decode("latin1", errors="replace")
116 -
                cookie_list_unicode.append(cookie_unicode)
117 -
            response = Response(request.url, headers={"Set-Cookie": cookie_list_unicode})
118 -
            return jar.make_cookies(response, request)
119 -
120 -
        def get_cookies_from_attribute(jar, request):
121 -
            if not request.cookies:
122 -
                return []
123 -
            elif isinstance(request.cookies, dict):
124 -
                cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
125 -
            else:
126 -
                cookies = request.cookies
127 -
            formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
128 -
            response = Response(request.url, headers={"Set-Cookie": formatted})
129 -
            return jar.make_cookies(response, request)
130 -
131 -
        return get_cookies_from_header(jar, request) + get_cookies_from_attribute(jar, request)
102 +
        if not request.cookies:
103 +
            return []
104 +
        elif isinstance(request.cookies, dict):
105 +
            cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
106 +
        else:
107 +
            cookies = request.cookies
108 +
        formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
109 +
        response = Response(request.url, headers={"Set-Cookie": formatted})
110 +
        return jar.make_cookies(response, request)
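
With the header-parsing branch removed above, cookies for a request now come only from the Request.cookies attribute. A minimal usage sketch (URL and cookie names are illustrative):

    import scrapy

    # dict form: plain name/value pairs
    req = scrapy.Request('https://example.com', cookies={'currency': 'USD'})

    # list-of-dicts form, which also carries the 'domain' and 'path' keys
    # read by _format_cookie() above
    req = scrapy.Request(
        'https://example.com',
        cookies=[{'name': 'currency', 'value': 'USD',
                  'domain': 'example.com', 'path': '/'}],
    )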

@@ -40,7 +40,7 @@
40 40
        self.export_empty_fields = options.pop('export_empty_fields', False)
41 41
        self.indent = options.pop('indent', None)
42 42
        if not dont_fail and options:
43 -
            raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
43 +
            raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
44 44
45 45
    def export_item(self, item):
46 46
        raise NotImplementedError
@@ -208,7 +208,7 @@
208 208
209 209
class CsvItemExporter(BaseItemExporter):
210 210
211 -
    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
211 +
    def __init__(self, file, include_headers_line=True, join_multivalued=',', errors=None, **kwargs):
212 212
        super().__init__(dont_fail=True, **kwargs)
213 213
        if not self.encoding:
214 214
            self.encoding = 'utf-8'
@@ -218,7 +218,8 @@
218 218
            line_buffering=False,
219 219
            write_through=True,
220 220
            encoding=self.encoding,
221 -
            newline=''  # Windows needs this https://github.com/scrapy/scrapy/issues/3034
221 +
            newline='',  # Windows needs this https://github.com/scrapy/scrapy/issues/3034
222 +
            errors=errors,
222 223
        )
223 224
        self.csv_writer = csv.writer(self.stream, **self._kwargs)
224 225
        self._headers_not_written = True
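
A short sketch of the new errors argument, which is forwarded to the io.TextIOWrapper wrapping the output file; the file name and item below are illustrative:

    from scrapy.exporters import CsvItemExporter

    with open('items.csv', 'wb') as f:
        # errors accepts the usual codec error handlers, e.g. 'ignore',
        # 'replace' or 'backslashreplace'
        exporter = CsvItemExporter(f, errors='replace')
        exporter.start_exporting()
        exporter.export_item({'name': 'value'})
        exporter.finish_exporting()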

@@ -43,4 +43,4 @@
43 43
    def item_dropped(self, item, spider, exception):
44 44
        reason = exception.__class__.__name__
45 45
        self.stats.inc_value('item_dropped_count', spider=spider)
46 -
        self.stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
46 +
        self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)

@@ -41,17 +41,17 @@
41 41
42 42
    def __repr__(self):
43 43
        cls_name = self.__class__.__name__
44 -
        return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
45 -
            cls_name, self.concurrency, self.delay, self.randomize_delay)
44 +
        return (f"{cls_name}(concurrency={self.concurrency!r}, "
45 +
                f"delay={self.delay:.2f}, "
46 +
                f"randomize_delay={self.randomize_delay!r})")
46 47
47 48
    def __str__(self):
48 49
        return (
49 -
            "<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
50 -
            "len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
51 -
                self.concurrency, self.delay, self.randomize_delay,
52 -
                len(self.active), len(self.queue), len(self.transferring),
53 -
                datetime.fromtimestamp(self.lastseen).isoformat()
54 -
            )
50 +
            f"<downloader.Slot concurrency={self.concurrency!r} "
51 +
            f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
52 +
            f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
53 +
            f"len(transferring)={len(self.transferring)} "
54 +
            f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
55 55
        )
56 56
57 57

@@ -71,8 +71,7 @@
71 71
        scheme = urlparse_cached(request).scheme
72 72
        handler = self._get_handler(scheme)
73 73
        if not handler:
74 -
            raise NotSupported("Unsupported URL scheme '%s': %s" %
75 -
                               (scheme, self._notconfigured[scheme]))
74 +
            raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
76 75
        return handler.download_request(request, spider)
77 76
78 77
    @defer.inlineCallbacks

@@ -1,9 +1,9 @@
1 1
from time import time
2 2
from urllib.parse import urlparse, urlunparse, urldefrag
3 3
4 -
from twisted.web.client import HTTPClientFactory
5 4
from twisted.web.http import HTTPClient
6 -
from twisted.internet import defer
5 +
from twisted.internet import defer, reactor
6 +
from twisted.internet.protocol import ClientFactory
7 7
8 8
from scrapy.http import Headers
9 9
from scrapy.utils.httpobj import urlparse_cached
@@ -88,22 +88,38 @@
88 88
            self.transport.stopProducing()
89 89
90 90
        self.factory.noPage(
91 -
            defer.TimeoutError("Getting %s took longer than %s seconds."
92 -
                               % (self.factory.url, self.factory.timeout)))
91 +
            defer.TimeoutError(f"Getting {self.factory.url} took longer "
92 +
                               f"than {self.factory.timeout} seconds."))
93 93
94 94
95 -
class ScrapyHTTPClientFactory(HTTPClientFactory):
96 -
    """Scrapy implementation of the HTTPClientFactory overwriting the
97 -
    setUrl method to make use of our Url object that cache the parse
98 -
    result.
99 -
    """
95 +
# This class used to inherit from Twisted’s
96 +
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
97 +
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
98 +
# non-overridden code into this class.
99 +
class ScrapyHTTPClientFactory(ClientFactory):
100 100
101 101
    protocol = ScrapyHTTPPageGetter
102 +
102 103
    waiting = 1
103 104
    noisy = False
104 105
    followRedirect = False
105 106
    afterFoundGet = False
106 107
108 +
    def _build_response(self, body, request):
109 +
        request.meta['download_latency'] = self.headers_time - self.start_time
110 +
        status = int(self.status)
111 +
        headers = Headers(self.response_headers)
112 +
        respcls = responsetypes.from_args(headers=headers, url=self._url)
113 +
        return respcls(url=self._url, status=status, headers=headers, body=body)
114 +
115 +
    def _set_connection_attributes(self, request):
116 +
        parsed = urlparse_cached(request)
117 +
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
118 +
        proxy = request.meta.get('proxy')
119 +
        if proxy:
120 +
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
121 +
            self.path = self.url
122 +
107 123
    def __init__(self, request, timeout=180):
108 124
        self._url = urldefrag(request.url)[0]
109 125
        # converting to bytes to comply to Twisted interface
@@ -138,21 +154,59 @@
138 154
        elif self.method == b'POST':
139 155
            self.headers['Content-Length'] = 0
140 156
141 -
    def _build_response(self, body, request):
142 -
        request.meta['download_latency'] = self.headers_time - self.start_time
143 -
        status = int(self.status)
144 -
        headers = Headers(self.response_headers)
145 -
        respcls = responsetypes.from_args(headers=headers, url=self._url)
146 -
        return respcls(url=self._url, status=status, headers=headers, body=body)
157 +
    def __repr__(self):
158 +
        return f"<{self.__class__.__name__}: {self.url}>"
147 159
148 -
    def _set_connection_attributes(self, request):
149 -
        parsed = urlparse_cached(request)
150 -
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
151 -
        proxy = request.meta.get('proxy')
152 -
        if proxy:
153 -
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
154 -
            self.path = self.url
160 +
    def _cancelTimeout(self, result, timeoutCall):
161 +
        if timeoutCall.active():
162 +
            timeoutCall.cancel()
163 +
        return result
164 +
165 +
    def buildProtocol(self, addr):
166 +
        p = ClientFactory.buildProtocol(self, addr)
167 +
        p.followRedirect = self.followRedirect
168 +
        p.afterFoundGet = self.afterFoundGet
169 +
        if self.timeout:
170 +
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
171 +
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
172 +
        return p
155 173
156 174
    def gotHeaders(self, headers):
157 175
        self.headers_time = time()
158 176
        self.response_headers = headers
177 +
178 +
    def gotStatus(self, version, status, message):
179 +
        """
180 +
        Set the status of the request on us.
181 +
        @param version: The HTTP version.
182 +
        @type version: L{bytes}
183 +
        @param status: The HTTP status code, an integer represented as a
184 +
            bytestring.
185 +
        @type status: L{bytes}
186 +
        @param message: The HTTP status message.
187 +
        @type message: L{bytes}
188 +
        """
189 +
        self.version, self.status, self.message = version, status, message
190 +
191 +
    def page(self, page):
192 +
        if self.waiting:
193 +
            self.waiting = 0
194 +
            self.deferred.callback(page)
195 +
196 +
    def noPage(self, reason):
197 +
        if self.waiting:
198 +
            self.waiting = 0
199 +
            self.deferred.errback(reason)
200 +
201 +
    def clientConnectionFailed(self, _, reason):
202 +
        """
203 +
        When a connection attempt fails, the request cannot be issued.  If no
204 +
        result has yet been provided to the result Deferred, provide the
205 +
        connection failure reason as an error result.
206 +
        """
207 +
        if self.waiting:
208 +
            self.waiting = 0
209 +
            # If the connection attempt failed, there is nothing more to
210 +
            # disconnect, so just fire that Deferred now.
211 +
            self._disconnectedDeferred.callback(None)
212 +
            self.deferred.errback(reason)

@@ -17,7 +17,7 @@
17 17
        plural = "s" if run != 1 else ""
18 18
19 19
        writeln(self.separator2)
20 -
        writeln("Ran %d contract%s in %.3fs" % (run, plural, stop - start))
20 +
        writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
21 21
        writeln()
22 22
23 23
        infos = []
@@ -25,14 +25,14 @@
25 25
            write("FAILED")
26 26
            failed, errored = map(len, (self.failures, self.errors))
27 27
            if failed:
28 -
                infos.append("failures=%d" % failed)
28 +
                infos.append(f"failures={failed}")
29 29
            if errored:
30 -
                infos.append("errors=%d" % errored)
30 +
                infos.append(f"errors={errored}")
31 31
        else:
32 32
            write("OK")
33 33
34 34
        if infos:
35 -
            writeln(" (%s)" % (", ".join(infos),))
35 +
            writeln(f" ({', '.join(infos)})")
36 36
        else:
37 37
            write("\n")
38 38
@@ -85,7 +85,7 @@
85 85
                        continue
86 86
                    print(spider)
87 87
                    for method in sorted(methods):
88 -
                        print('  * %s' % method)
88 +
                        print(f'  * {method}')
89 89
            else:
90 90
                start = time.time()
91 91
                self.crawler_process.start()

@@ -12,10 +12,12 @@
12 12
    content = string.Template(raw).substitute(**kwargs)
13 13
14 14
    render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
15 +
16 +
    if path.endswith('.tmpl'):
17 +
        os.rename(path, render_path)
18 +
15 19
    with open(render_path, 'wb') as fp:
16 20
        fp.write(content.encode('utf8'))
17 -
    if path.endswith('.tmpl'):
18 -
        os.remove(path)
19 21
20 22
21 23
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
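
render_templatefile() now renames the .tmpl file to its final name before writing the rendered content, instead of writing first and deleting the template afterwards. An illustrative call, assuming a template with string.Template placeholders such as $classname and $name:

    from scrapy.utils.template import render_templatefile

    # Reads 'spider.py.tmpl', renames it to 'spider.py' and writes the
    # substituted content there; the .tmpl file no longer exists afterwards.
    render_templatefile('spider.py.tmpl', classname='ExampleSpider', name='example')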

@@ -223,7 +223,7 @@
223 223
        self.db = None
224 224
225 225
    def open_spider(self, spider):
226 -
        dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
226 +
        dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
227 227
        self.db = self.dbmodule.open(dbpath, 'c')
228 228
229 229
        logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
@@ -251,13 +251,13 @@
251 251
            'headers': dict(response.headers),
252 252
            'body': response.body,
253 253
        }
254 -
        self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
255 -
        self.db['%s_time' % key] = str(time())
254 +
        self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
255 +
        self.db[f'{key}_time'] = str(time())
256 256
257 257
    def _read_data(self, spider, request):
258 258
        key = self._request_key(request)
259 259
        db = self.db
260 -
        tkey = '%s_time' % key
260 +
        tkey = f'{key}_time'
261 261
        if tkey not in db:
262 262
            return  # not found
263 263
@@ -265,7 +265,7 @@
265 265
        if 0 < self.expiration_secs < time() - float(ts):
266 266
            return  # expired
267 267
268 -
        return pickle.loads(db['%s_data' % key])
268 +
        return pickle.loads(db[f'{key}_data'])
269 269
270 270
    def _request_key(self, request):
271 271
        return request_fingerprint(request)

@@ -61,7 +61,7 @@
61 61
62 62
        if netloc not in self._parsers:
63 63
            self._parsers[netloc] = Deferred()
64 -
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
64 +
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
65 65
            robotsreq = Request(
66 66
                robotsurl,
67 67
                priority=self.DOWNLOAD_PRIORITY,
@@ -94,7 +94,7 @@
94 94
95 95
    def _parse_robots(self, response, netloc, spider):
96 96
        self.crawler.stats.inc_value('robotstxt/response_count')
97 -
        self.crawler.stats.inc_value('robotstxt/response_status_count/{}'.format(response.status))
97 +
        self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
98 98
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
99 99
        rp_dfd = self._parsers[netloc]
100 100
        self._parsers[netloc] = rp
@@ -102,7 +102,7 @@
102 102
103 103
    def _robots_error(self, failure, netloc):
104 104
        if failure.type is not IgnoreRequest:
105 -
            key = 'robotstxt/exception_count/{}'.format(failure.type)
105 +
            key = f'robotstxt/exception_count/{failure.type}'
106 106
            self.crawler.stats.inc_value(key)
107 107
        rp_dfd = self._parsers[netloc]
108 108
        self._parsers[netloc] = None

@@ -17,13 +17,13 @@
17 17
18 18
    def process_request(self, request, spider):
19 19
        self.stats.inc_value('downloader/request_count', spider=spider)
20 -
        self.stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
20 +
        self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
21 21
        reqlen = len(request_httprepr(request))
22 22
        self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
23 23
24 24
    def process_response(self, request, response, spider):
25 25
        self.stats.inc_value('downloader/response_count', spider=spider)
26 -
        self.stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
26 +
        self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
27 27
        reslen = len(response_httprepr(response))
28 28
        self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
29 29
        return response
@@ -31,4 +31,4 @@
31 31
    def process_exception(self, request, exception, spider):
32 32
        ex_class = global_object_name(exception.__class__)
33 33
        self.stats.inc_value('downloader/exception_count', spider=spider)
34 -
        self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
34 +
        self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)

@@ -112,8 +112,8 @@
112 112
    request_cls = None
113 113
114 114
    def __init__(self, method, *args):
115 -
        self.testcase_pre = _create_testcase(method, '@%s pre-hook' % self.name)
116 -
        self.testcase_post = _create_testcase(method, '@%s post-hook' % self.name)
115 +
        self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
116 +
        self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
117 117
        self.args = args
118 118
119 119
    def add_pre_hook(self, request, results):
@@ -172,8 +172,8 @@
172 172
173 173
    class ContractTestCase(TestCase):
174 174
        def __str__(_self):
175 -
            return "[%s] %s (%s)" % (spider, method.__name__, desc)
175 +
            return f"[{spider}] {method.__name__} ({desc})"
176 176
177 -
    name = '%s_%s' % (spider, method.__name__)
177 +
    name = f'{spider}_{method.__name__}'
178 178
    setattr(ContractTestCase, name, lambda x: x)
179 179
    return ContractTestCase(name)
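
For context, _create_testcase() above backs the pre- and post-hook test cases of spider contracts. A minimal custom contract sketch (the contract name and the check it performs are illustrative, not part of this diff); it would still need to be registered through the SPIDER_CONTRACTS setting:

    from scrapy.contracts import Contract
    from scrapy.exceptions import ContractFail

    class ReturnsTitleContract(Contract):
        """Fail if a returned item lacks a 'title' field. Used as @returns_title."""
        name = 'returns_title'

        def post_process(self, output):
            for obj in output:
                if isinstance(obj, dict) and not obj.get('title'):
                    raise ContractFail("missing 'title' field")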

@@ -19,22 +19,39 @@
19 19
20 20
from scrapy import signals
21 21
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
22 -
from scrapy.utils.boto import is_botocore
22 +
from scrapy.utils.boto import is_botocore_available
23 23
from scrapy.utils.conf import feed_complete_default_values_from_settings
24 24
from scrapy.utils.ftp import ftp_store_file
25 25
from scrapy.utils.log import failure_to_exc_info
26 26
from scrapy.utils.misc import create_instance, load_object
27 -
from scrapy.utils.python import without_none_values
27 +
from scrapy.utils.python import get_func_args, without_none_values
28 28
29 29
30 30
logger = logging.getLogger(__name__)
31 31
32 32
33 +
def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
34 +
    argument_names = get_func_args(builder)
35 +
    if 'feed_options' in argument_names:
36 +
        kwargs['feed_options'] = feed_options
37 +
    else:
38 +
        warnings.warn(
39 +
            "{} does not support the 'feed_options' keyword argument. Add a "
40 +
            "'feed_options' parameter to its signature to remove this "
41 +
            "warning. This parameter will become mandatory in a future "
42 +
            "version of Scrapy."
43 +
            .format(builder.__qualname__),
44 +
            category=ScrapyDeprecationWarning
45 +
        )
46 +
    return builder(*preargs, uri, *args, **kwargs)
47 +
48 +
33 49
class IFeedStorage(Interface):
34 50
    """Interface that all Feed Storages must implement"""
35 51
36 -
    def __init__(uri):
37 -
        """Initialize the storage with the parameters given in the URI"""
52 +
    def __init__(uri, *, feed_options=None):
53 +
        """Initialize the storage with the parameters given in the URI and the
54 +
        feed-specific options (see :setting:`FEEDS`)"""
38 55
39 56
    def open(spider):
40 57
        """Open the storage for the given spider. It must return a file-like
@@ -64,10 +81,15 @@
64 81
@implementer(IFeedStorage)
65 82
class StdoutFeedStorage:
66 83
67 -
    def __init__(self, uri, _stdout=None):
84 +
    def __init__(self, uri, _stdout=None, *, feed_options=None):
68 85
        if not _stdout:
69 86
            _stdout = sys.stdout.buffer
70 87
        self._stdout = _stdout
88 +
        if feed_options and feed_options.get('overwrite', False) is True:
89 +
            logger.warning('Standard output (stdout) storage does not support '
90 +
                           'overwriting. To suppress this warning, remove the '
91 +
                           'overwrite option from your FEEDS setting, or set '
92 +
                           'it to False.')
71 93
72 94
    def open(self, spider):
73 95
        return self._stdout
@@ -79,14 +101,16 @@
79 101
@implementer(IFeedStorage)
80 102
class FileFeedStorage:
81 103
82 -
    def __init__(self, uri):
104 +
    def __init__(self, uri, *, feed_options=None):
83 105
        self.path = file_uri_to_path(uri)
106 +
        feed_options = feed_options or {}
107 +
        self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'
84 108
85 109
    def open(self, spider):
86 110
        dirname = os.path.dirname(self.path)
87 111
        if dirname and not os.path.exists(dirname):
88 112
            os.makedirs(dirname)
89 -
        return open(self.path, 'ab')
113 +
        return open(self.path, self.write_mode)
90 114
91 115
    def store(self, file):
92 116
        file.close()
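
The new overwrite feed option decides whether FileFeedStorage opens its target in 'wb' mode or keeps the previous append behaviour. A hedged settings sketch (the file name is illustrative):

    # settings.py
    FEEDS = {
        'items.json': {
            'format': 'json',
            'overwrite': True,  # open in 'wb' instead of the default 'ab'
        },
    }
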
@@ -94,64 +118,44 @@
94 118
95 119
class S3FeedStorage(BlockingFeedStorage):
96 120
97 -
    def __init__(self, uri, access_key=None, secret_key=None, acl=None):
98 -
        # BEGIN Backward compatibility for initialising without keys (and
99 -
        # without using from_crawler)
100 -
        no_defaults = access_key is None and secret_key is None
101 -
        if no_defaults:
102 -
            from scrapy.utils.project import get_project_settings
103 -
            settings = get_project_settings()
104 -
            if 'AWS_ACCESS_KEY_ID' in settings or 'AWS_SECRET_ACCESS_KEY' in settings:
105 -
                warnings.warn(
106 -
                    "Initialising `scrapy.extensions.feedexport.S3FeedStorage` "
107 -
                    "without AWS keys is deprecated. Please supply credentials or "
108 -
                    "use the `from_crawler()` constructor.",
109 -
                    category=ScrapyDeprecationWarning,
110 -
                    stacklevel=2
111 -
                )
112 -
                access_key = settings['AWS_ACCESS_KEY_ID']
113 -
                secret_key = settings['AWS_SECRET_ACCESS_KEY']
114 -
        # END Backward compatibility
121 +
    def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
122 +
                 feed_options=None):
123 +
        if not is_botocore_available():
124 +
            raise NotConfigured('missing botocore library')
115 125
        u = urlparse(uri)
116 126
        self.bucketname = u.hostname
117 127
        self.access_key = u.username or access_key
118 128
        self.secret_key = u.password or secret_key
119 -
        self.is_botocore = is_botocore()
120 129
        self.keyname = u.path[1:]  # remove first "/"
121 130
        self.acl = acl
122 -
        if self.is_botocore:
123 -
            import botocore.session
124 -
            session = botocore.session.get_session()
125 -
            self.s3_client = session.create_client(
126 -
                's3', aws_access_key_id=self.access_key,
127 -
                aws_secret_access_key=self.secret_key)
128 -
        else:
129 -
            import boto
130 -
            self.connect_s3 = boto.connect_s3
131 +
        import botocore.session
132 +
        session = botocore.session.get_session()
133 +
        self.s3_client = session.create_client(
134 +
            's3', aws_access_key_id=self.access_key,
135 +
            aws_secret_access_key=self.secret_key)
136 +
        if feed_options and feed_options.get('overwrite', True) is False:
137 +
            logger.warning('S3 does not support appending to files. To '
138 +
                           'suppress this warning, remove the overwrite '
139 +
                           'option from your FEEDS setting or set it to True.')
131 140
132 141
    @classmethod
133 -
    def from_crawler(cls, crawler, uri):
134 -
        return cls(
135 -
            uri=uri,
142 +
    def from_crawler(cls, crawler, uri, *, feed_options=None):
143 +
        return build_storage(
144 +
            cls,
145 +
            uri,
136 146
            access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
137 147
            secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
138 -
            acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None
148 +
            acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
149 +
            feed_options=feed_options,
139 150
        )
140 151
141 152
    def _store_in_thread(self, file):
142 153
        file.seek(0)
143 -
        if self.is_botocore:
144 -
            kwargs = {'ACL': self.acl} if self.acl else {}
145 -
            self.s3_client.put_object(
146 -
                Bucket=self.bucketname, Key=self.keyname, Body=file,
147 -
                **kwargs)
148 -
        else:
149 -
            conn = self.connect_s3(self.access_key, self.secret_key)
150 -
            bucket = conn.get_bucket(self.bucketname, validate=False)
151 -
            key = bucket.new_key(self.keyname)
152 -
            kwargs = {'policy': self.acl} if self.acl else {}
153 -
            key.set_contents_from_file(file, **kwargs)
154 -
            key.close()
154 +
        kwargs = {'ACL': self.acl} if self.acl else {}
155 +
        self.s3_client.put_object(
156 +
            Bucket=self.bucketname, Key=self.keyname, Body=file,
157 +
            **kwargs)
158 +
        file.close()
155 159
156 160
157 161
class GCSFeedStorage(BlockingFeedStorage):
@@ -182,27 +186,31 @@
182 186
183 187
class FTPFeedStorage(BlockingFeedStorage):
184 188
185 -
    def __init__(self, uri, use_active_mode=False):
189 +
    def __init__(self, uri, use_active_mode=False, *, feed_options=None):
186 190
        u = urlparse(uri)
187 191
        self.host = u.hostname
188 192
        self.port = int(u.port or '21')
189 193
        self.username = u.username
190 -
        self.password = unquote(u.password)
194 +
        self.password = unquote(u.password or '')
191 195
        self.path = u.path
192 196
        self.use_active_mode = use_active_mode
197 +
        self.overwrite = not feed_options or feed_options.get('overwrite', True)
193 198
194 199
    @classmethod
195 -
    def from_crawler(cls, crawler, uri):
196 -
        return cls(
197 -
            uri=uri,
198 -
            use_active_mode=crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE')
200 +
    def from_crawler(cls, crawler, uri, *, feed_options=None):
201 +
        return build_storage(
202 +
            cls,
203 +
            uri,
204 +
            crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
205 +
            feed_options=feed_options,
199 206
        )
200 207
201 208
    def _store_in_thread(self, file):
202 209
        ftp_store_file(
203 210
            path=self.path, file=file, host=self.host,
204 211
            port=self.port, username=self.username,
205 -
            password=self.password, use_active_mode=self.use_active_mode
212 +
            password=self.password, use_active_mode=self.use_active_mode,
213 +
            overwrite=self.overwrite,
206 214
        )
207 215
208 216
@@ -259,32 +267,32 @@
259 267
                category=ScrapyDeprecationWarning, stacklevel=2,
260 268
            )
261 269
            uri = str(self.settings['FEED_URI'])  # handle pathlib.Path objects
262 -
            feed = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
263 -
            self.feeds[uri] = feed_complete_default_values_from_settings(feed, self.settings)
270 +
            feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
271 +
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
264 272
        # End: Backward compatibility for FEED_URI and FEED_FORMAT settings
265 273
266 274
        # 'FEEDS' setting takes precedence over 'FEED_URI'
267 -
        for uri, feed in self.settings.getdict('FEEDS').items():
275 +
        for uri, feed_options in self.settings.getdict('FEEDS').items():
268 276
            uri = str(uri)  # handle pathlib.Path objects
269 -
            self.feeds[uri] = feed_complete_default_values_from_settings(feed, self.settings)
277 +
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
270 278
271 279
        self.storages = self._load_components('FEED_STORAGES')
272 280
        self.exporters = self._load_components('FEED_EXPORTERS')
273 -
        for uri, feed in self.feeds.items():
274 -
            if not self._storage_supported(uri):
281 +
        for uri, feed_options in self.feeds.items():
282 +
            if not self._storage_supported(uri, feed_options):
275 283
                raise NotConfigured
276 284
            if not self._settings_are_valid():
277 285
                raise NotConfigured
278 -
            if not self._exporter_supported(feed['format']):
286 +
            if not self._exporter_supported(feed_options['format']):
279 287
                raise NotConfigured
280 288
281 289
    def open_spider(self, spider):
282 -
        for uri, feed in self.feeds.items():
283 -
            uri_params = self._get_uri_params(spider, feed['uri_params'])
290 +
        for uri, feed_options in self.feeds.items():
291 +
            uri_params = self._get_uri_params(spider, feed_options['uri_params'])
284 292
            self.slots.append(self._start_new_batch(
285 293
                batch_id=1,
286 294
                uri=uri % uri_params,
287 -
                feed=feed,
295 +
                feed_options=feed_options,
288 296
                spider=spider,
289 297
                uri_template=uri,
290 298
            ))
@@ -311,44 +319,53 @@
311 319
        # Use `largs=log_args` to copy log_args into function's scope
312 320
        # instead of using `log_args` from the outer scope
313 321
        d.addCallback(
314 -
            lambda _, largs=log_args: logger.info(
315 -
                logfmt % "Stored", largs, extra={'spider': spider}
316 -
            )
322 +
            self._handle_store_success, log_args, logfmt, spider, type(slot.storage).__name__
317 323
        )
318 324
        d.addErrback(
319 -
            lambda f, largs=log_args: logger.error(
320 -
                logfmt % "Error storing", largs,
321 -
                exc_info=failure_to_exc_info(f), extra={'spider': spider}
322 -
            )
325 +
            self._handle_store_error, log_args, logfmt, spider, type(slot.storage).__name__
323 326
        )
324 327
        return d
325 328
326 -
    def _start_new_batch(self, batch_id, uri, feed, spider, uri_template):
329 +
    def _handle_store_error(self, f, largs, logfmt, spider, slot_type):
330 +
        logger.error(
331 +
            logfmt % "Error storing", largs,
332 +
            exc_info=failure_to_exc_info(f), extra={'spider': spider}
333 +
        )
334 +
        self.crawler.stats.inc_value(f"feedexport/failed_count/{slot_type}")
335 +
336 +
    def _handle_store_success(self, f, largs, logfmt, spider, slot_type):
337 +
        logger.info(
338 +
            logfmt % "Stored", largs, extra={'spider': spider}
339 +
        )
340 +
        self.crawler.stats.inc_value(f"feedexport/success_count/{slot_type}")
341 +
342 +
    def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
327 343
        """
328 344
        Redirect the output data stream to a new file.
329 345
        Execute multiple times if FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
330 346
        :param batch_id: sequence number of current batch
331 347
        :param uri: uri of the new batch to start
332 -
        :param feed: dict with parameters of feed
348 +
        :param feed_options: dict with parameters of feed
333 349
        :param spider: user spider
334 350
        :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri
335 351
        """
336 -
        storage = self._get_storage(uri)
352 +
        storage = self._get_storage(uri, feed_options)
337 353
        file = storage.open(spider)
338 354
        exporter = self._get_exporter(
339 355
            file=file,
340 -
            format=feed['format'],
341 -
            fields_to_export=feed['fields'],
342 -
            encoding=feed['encoding'],
343 -
            indent=feed['indent'],
356 +
            format=feed_options['format'],
357 +
            fields_to_export=feed_options['fields'],
358 +
            encoding=feed_options['encoding'],
359 +
            indent=feed_options['indent'],
360 +
            **feed_options['item_export_kwargs'],
344 361
        )
345 362
        slot = _FeedSlot(
346 363
            file=file,
347 364
            exporter=exporter,
348 365
            storage=storage,
349 366
            uri=uri,
350 -
            format=feed['format'],
351 -
            store_empty=feed['store_empty'],
367 +
            format=feed_options['format'],
368 +
            store_empty=feed_options['store_empty'],
352 369
            batch_id=batch_id,
353 370
            uri_template=uri_template,
354 371
        )
@@ -372,7 +389,7 @@
372 389
                slots.append(self._start_new_batch(
373 390
                    batch_id=slot.batch_id + 1,
374 391
                    uri=slot.uri_template % uri_params,
375 -
                    feed=self.feeds[slot.uri_template],
392 +
                    feed_options=self.feeds[slot.uri_template],
376 393
                    spider=spider,
377 394
                    uri_template=slot.uri_template,
378 395
                ))
@@ -411,11 +428,11 @@
411 428
                return False
412 429
        return True
413 430
414 -
    def _storage_supported(self, uri):
431 +
    def _storage_supported(self, uri, feed_options):
415 432
        scheme = urlparse(uri).scheme
416 433
        if scheme in self.storages:
417 434
            try:
418 -
                self._get_storage(uri)
435 +
                self._get_storage(uri, feed_options)
419 436
                return True
420 437
            except NotConfigured as e:
421 438
                logger.error("Disabled feed storage scheme: %(scheme)s. "
@@ -433,8 +450,30 @@
433 450
    def _get_exporter(self, file, format, *args, **kwargs):
434 451
        return self._get_instance(self.exporters[format], file, *args, **kwargs)
435 452
436 -
    def _get_storage(self, uri):
437 -
        return self._get_instance(self.storages[urlparse(uri).scheme], uri)
453 +
    def _get_storage(self, uri, feed_options):
454 +
        """Fork of create_instance specific to feed storage classes
455 +
456 +
        It supports not passing the *feed_options* parameters to classes that
457 +
        do not support it, and issuing a deprecation warning instead.
458 +
        """
459 +
        feedcls = self.storages[urlparse(uri).scheme]
460 +
        crawler = getattr(self, 'crawler', None)
461 +
462 +
        def build_instance(builder, *preargs):
463 +
            return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
464 +
465 +
        if crawler and hasattr(feedcls, 'from_crawler'):
466 +
            instance = build_instance(feedcls.from_crawler, crawler)
467 +
            method_name = 'from_crawler'
468 +
        elif hasattr(feedcls, 'from_settings'):
469 +
            instance = build_instance(feedcls.from_settings, self.settings)
470 +
            method_name = 'from_settings'
471 +
        else:
472 +
            instance = build_instance(feedcls)
473 +
            method_name = '__new__'
474 +
        if instance is None:
475 +
            raise TypeError("%s.%s returned None" % (feedcls.__qualname__, method_name))
476 +
        return instance
438 477
439 478
    def _get_uri_params(self, spider, uri_params, slot=None):
440 479
        params = {}
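
Taken together, build_storage() and _get_storage() let third-party storages opt in to the new feed_options keyword. A minimal sketch of such a storage; the class, scheme and module names are assumptions:

    from scrapy.extensions.feedexport import BlockingFeedStorage

    class NullFeedStorage(BlockingFeedStorage):
        """Accepts feed_options and silently discards the exported data."""

        def __init__(self, uri, *, feed_options=None):
            self.uri = uri
            self.feed_options = feed_options or {}

        def _store_in_thread(self, file):
            file.close()

    # settings.py
    FEED_STORAGES = {'null': 'myproject.storages.NullFeedStorage'}
    FEEDS = {'null://discard': {'format': 'jsonlines'}}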

@@ -23,10 +23,10 @@
23 23
24 24
    def _process_finished(self, pp, cmd, check_code):
25 25
        if pp.exitcode and check_code:
26 -
            msg = "process %s exit with code %d" % (cmd, pp.exitcode)
27 -
            msg += "\n>>> stdout <<<\n%s" % pp.out
26 +
            msg = f"process {cmd} exit with code {pp.exitcode}"
27 +
            msg += f"\n>>> stdout <<<\n{pp.out}"
28 28
            msg += "\n"
29 -
            msg += "\n>>> stderr <<<\n%s" % pp.err
29 +
            msg += f"\n>>> stderr <<<\n{pp.err}"
30 30
            raise RuntimeError(msg)
31 31
        return pp.exitcode, pp.out, pp.err
32 32

@@ -50,7 +50,7 @@
50 50
        key_info.append(ffi_buf_to_string(cname))
51 51
    else:
52 52
        key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
53 -
    key_info.append('%s bits' % pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key))
53 +
    key_info.append(f'{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits')
54 54
    return ', '.join(key_info)
55 55
56 56
@@ -58,4 +58,4 @@
58 58
    system_openssl = OpenSSL.SSL.SSLeay_version(
59 59
        OpenSSL.SSL.SSLEAY_VERSION
60 60
    ).decode('ascii', errors='replace')
61 -
    return '{} ({})'.format(OpenSSL.version.__version__, system_openssl)
61 +
    return f'{OpenSSL.version.__version__} ({system_openssl})'

@@ -1,6 +1,6 @@
1 1
def obsolete_setter(setter, attrname):
2 2
    def newsetter(self, value):
3 3
        c = self.__class__.__name__
4 -
        msg = "%s.%s is not modifiable, use %s.replace() instead" % (c, attrname, c)
4 +
        msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
5 5
        raise AttributeError(msg)
6 6
    return newsetter

@@ -45,7 +45,7 @@
45 45
        elif mimetype in self.classes:
46 46
            return self.classes[mimetype]
47 47
        else:
48 -
            basetype = "%s/*" % mimetype.split('/')[0]
48 +
            basetype = f"{mimetype.split('/')[0]}/*"
49 49
            return self.classes.get(basetype, Response)
50 50
51 51
    def from_content_type(self, content_type, content_encoding=None):

@@ -1,12 +1,16 @@
1 1
import functools
2 2
import logging
3 3
from collections import defaultdict
4 +
from inspect import signature
5 +
from warnings import warn
6 +
4 7
from twisted.internet.defer import Deferred, DeferredList
5 8
from twisted.python.failure import Failure
6 9
7 10
from scrapy.settings import Settings
8 11
from scrapy.utils.datatypes import SequenceExclude
9 12
from scrapy.utils.defer import mustbe_deferred, defer_result
13 +
from scrapy.utils.deprecate import ScrapyDeprecationWarning
10 14
from scrapy.utils.request import request_fingerprint
11 15
from scrapy.utils.misc import arg_to_iter
12 16
from scrapy.utils.log import failure_to_exc_info
@@ -27,6 +31,7 @@
27 31
28 32
    def __init__(self, download_func=None, settings=None):
29 33
        self.download_func = download_func
34 +
        self._expects_item = {}
30 35
31 36
        if isinstance(settings, dict) or settings is None:
32 37
            settings = Settings(settings)
@@ -38,6 +43,9 @@
38 43
        )
39 44
        self._handle_statuses(self.allow_redirects)
40 45
46 +
        # Check if deprecated methods are being used and make them compatible
47 +
        self._make_compatible()
48 +
41 49
    def _handle_statuses(self, allow_redirects):
42 50
        self.handle_httpstatus_list = None
43 51
        if allow_redirects:
@@ -53,7 +61,7 @@
53 61
        'MYPIPE_IMAGES'
54 62
        """
55 63
        class_name = self.__class__.__name__
56 -
        formatted_key = "{}_{}".format(class_name.upper(), key)
64 +
        formatted_key = f"{class_name.upper()}_{key}"
57 65
        if (
58 66
            not base_class_name
59 67
            or class_name == base_class_name
@@ -77,11 +85,11 @@
77 85
    def process_item(self, item, spider):
78 86
        info = self.spiderinfo
79 87
        requests = arg_to_iter(self.get_media_requests(item, info))
80 -
        dlist = [self._process_request(r, info) for r in requests]
88 +
        dlist = [self._process_request(r, info, item) for r in requests]
81 89
        dfd = DeferredList(dlist, consumeErrors=1)
82 90
        return dfd.addCallback(self.item_completed, item, info)
83 91
84 -
    def _process_request(self, request, info):
92 +
    def _process_request(self, request, info, item):
85 93
        fp = request_fingerprint(request)
86 94
        cb = request.callback or (lambda _: _)
87 95
        eb = request.errback
@@ -102,34 +110,72 @@
102 110
103 111
        # Download request checking media_to_download hook output first
104 112
        info.downloading.add(fp)
105 -
        dfd = mustbe_deferred(self.media_to_download, request, info)
106 -
        dfd.addCallback(self._check_media_to_download, request, info)
113 +
        dfd = mustbe_deferred(self.media_to_download, request, info, item=item)
114 +
        dfd.addCallback(self._check_media_to_download, request, info, item=item)
107 115
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
108 116
        dfd.addErrback(lambda f: logger.error(
109 117
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
110 118
        )
111 119
        return dfd.addBoth(lambda _: wad)  # it must return wad at last
112 120
121 +
    def _make_compatible(self):
122 +
        """Make overridable methods of MediaPipeline and subclasses backwards compatible"""
123 +
        methods = [
124 +
            "file_path", "media_to_download", "media_downloaded",
125 +
            "file_downloaded", "image_downloaded", "get_images"
126 +
        ]
127 +
128 +
        for method_name in methods:
129 +
            method = getattr(self, method_name, None)
130 +
            if callable(method):
131 +
                setattr(self, method_name, self._compatible(method))
132 +
133 +
    def _compatible(self, func):
134 +
        """Wrapper for overridable methods to allow backwards compatibility"""
135 +
        self._check_signature(func)
136 +
137 +
        @functools.wraps(func)
138 +
        def wrapper(*args, **kwargs):
139 +
            if self._expects_item[func.__name__]:
140 +
                return func(*args, **kwargs)
141 +
142 +
            kwargs.pop('item', None)
143 +
            return func(*args, **kwargs)
144 +
145 +
        return wrapper
146 +
147 +
    def _check_signature(self, func):
148 +
        sig = signature(func)
149 +
        self._expects_item[func.__name__] = True
150 +
151 +
        if 'item' not in sig.parameters:
152 +
            old_params = str(sig)[1:-1]
153 +
            new_params = old_params + ", *, item=None"
154 +
            warn(f'{func.__name__}(self, {old_params}) is deprecated, '
155 +
                 f'please use {func.__name__}(self, {new_params})',
156 +
                 ScrapyDeprecationWarning, stacklevel=2)
157 +
            self._expects_item[func.__name__] = False
158 +
113 159
    def _modify_media_request(self, request):
114 160
        if self.handle_httpstatus_list:
115 161
            request.meta['handle_httpstatus_list'] = self.handle_httpstatus_list
116 162
        else:
117 163
            request.meta['handle_httpstatus_all'] = True
118 164
119 -
    def _check_media_to_download(self, result, request, info):
165 +
    def _check_media_to_download(self, result, request, info, item):
120 166
        if result is not None:
121 167
            return result
122 168
        if self.download_func:
123 169
            # this ugly code was left only to support tests. TODO: remove
124 170
            dfd = mustbe_deferred(self.download_func, request, info.spider)
125 171
            dfd.addCallbacks(
126 -
                callback=self.media_downloaded, callbackArgs=(request, info),
172 +
                callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
127 173
                errback=self.media_failed, errbackArgs=(request, info))
128 174
        else:
129 175
            self._modify_media_request(request)
130 176
            dfd = self.crawler.engine.download(request, info.spider)
131 177
            dfd.addCallbacks(
132 -
                callback=self.media_downloaded, callbackArgs=(request, info),
178 +
                callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
133 179
                errback=self.media_failed, errbackArgs=(request, info))
134 180
        return dfd
135 181
@@ -171,7 +217,7 @@
171 217
            defer_result(result).chainDeferred(wad)
172 218
173 219
    # Overridable Interface
174 -
    def media_to_download(self, request, info):
220 +
    def media_to_download(self, request, info, *, item=None):
175 221
        """Check request before starting download"""
176 222
        pass
177 223
@@ -179,7 +225,7 @@
179 225
        """Returns the media requests to download"""
180 226
        pass
181 227
182 -
    def media_downloaded(self, response, request, info):
228 +
    def media_downloaded(self, response, request, info, *, item=None):
183 229
        """Handler for success downloads"""
184 230
        return response
185 231
@@ -199,3 +245,7 @@
199 245
                        extra={'spider': info.spider}
200 246
                    )
201 247
        return item
248 +
249 +
    def file_path(self, request, response=None, info=None, *, item=None):
250 +
        """Returns the path where downloaded media should be stored"""
251 +
        pass
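
The overridable methods above now receive the source item as a keyword-only argument, while _make_compatible() keeps old signatures working behind a deprecation warning. A hedged sketch of an override that uses it (the pipeline and field names are illustrative):

    from scrapy.pipelines.files import FilesPipeline

    class PerItemPathPipeline(FilesPipeline):
        def file_path(self, request, response=None, info=None, *, item=None):
            # The item is available here now, so storage paths can be derived
            # from its fields.
            folder = item.get('name', 'unnamed') if item else 'unnamed'
            return f"files/{folder}/{request.url.split('/')[-1]}"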

@@ -11,8 +11,8 @@
11 11
    abspath = os.path.abspath(filepath)
12 12
    dirname, file = os.path.split(abspath)
13 13
    fname, fext = os.path.splitext(file)
14 -
    if fext != '.py':
15 -
        raise ValueError("Not a Python source file: %s" % abspath)
14 +
    if fext not in ('.py', '.pyw'):
15 +
        raise ValueError(f"Not a Python source file: {abspath}")
16 16
    if dirname:
17 17
        sys.path = [dirname] + sys.path
18 18
    try:
@@ -42,14 +42,14 @@
42 42
            raise UsageError()
43 43
        filename = args[0]
44 44
        if not os.path.exists(filename):
45 -
            raise UsageError("File not found: %s\n" % filename)
45 +
            raise UsageError(f"File not found: {filename}\n")
46 46
        try:
47 47
            module = _import_file(filename)
48 48
        except (ImportError, ValueError) as e:
49 -
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
49 +
            raise UsageError(f"Unable to load {filename!r}: {e}\n")
50 50
        spclasses = list(iter_spider_classes(module))
51 51
        if not spclasses:
52 -
            raise UsageError("No spider found in file: %s\n" % filename)
52 +
            raise UsageError(f"No spider found in file: {filename}\n")
53 53
        spidercls = spclasses.pop()
54 54
55 55
        self.crawler_process.crawl(spidercls, **opts.spargs)

@@ -54,8 +54,8 @@
54 54
55 55
    def crawled(self, request, response, spider):
56 56
        """Logs a message when the crawler finds a webpage."""
57 -
        request_flags = ' %s' % str(request.flags) if request.flags else ''
58 -
        response_flags = ' %s' % str(response.flags) if response.flags else ''
57 +
        request_flags = f' {str(request.flags)}' if request.flags else ''
58 +
        response_flags = f' {str(response.flags)}' if response.flags else ''
59 59
        return {
60 60
            'level': logging.DEBUG,
61 61
            'msg': CRAWLEDMSG,

@@ -61,7 +61,7 @@
61 61
        group.add_option("--logfile", metavar="FILE",
62 62
                         help="log file. if omitted stderr will be used")
63 63
        group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
64 -
                         help="log level (default: %s)" % self.settings['LOG_LEVEL'])
64 +
                         help=f"log level (default: {self.settings['LOG_LEVEL']})")
65 65
        group.add_option("--nolog", action="store_true",
66 66
                         help="disable logging completely")
67 67
        group.add_option("--profile", metavar="FILE", default=None,
@@ -115,9 +115,11 @@
115 115
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
116 116
                          help="set spider argument (may be repeated)")
117 117
        parser.add_option("-o", "--output", metavar="FILE", action="append",
118 -
                          help="dump scraped items into FILE (use - for stdout)")
118 +
                          help="append scraped items to the end of FILE (use - for stdout)")
119 +
        parser.add_option("-O", "--overwrite-output", metavar="FILE", action="append",
120 +
                          help="dump scraped items into FILE, overwriting any existing file")
119 121
        parser.add_option("-t", "--output-format", metavar="FORMAT",
120 -
                          help="format to use for dumping items with -o")
122 +
                          help="format to use for dumping items")
121 123
122 124
    def process_options(self, args, opts):
123 125
        ScrapyCommand.process_options(self, args, opts)
@@ -125,6 +127,11 @@
125 127
            opts.spargs = arglist_to_dict(opts.spargs)
126 128
        except ValueError:
127 129
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
128 -
        if opts.output:
129 -
            feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
130 +
        if opts.output or opts.overwrite_output:
131 +
            feeds = feed_process_params_from_cli(
132 +
                self.settings,
133 +
                opts.output,
134 +
                opts.output_format,
135 +
                opts.overwrite_output,
136 +
            )
130 137
            self.settings.set('FEEDS', feeds, priority='cmdline')
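
In short, -o now appends to the target while the new -O overwrites it, and the format can be given after a colon in the URI (for example, -O items:csv). The overwrite flag ends up in the FEEDS setting, roughly as in this illustrative sketch:

    # Settings equivalent of `scrapy crawl example -O items.json`
    FEEDS = {
        'items.json': {'format': 'json', 'overwrite': True},
    }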

@@ -1,11 +1,32 @@
1 1
"""Boto/botocore helpers"""
2 +
import warnings
2 3
3 -
from scrapy.exceptions import NotConfigured
4 +
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
4 5
5 6
6 7
def is_botocore():
8 +
    """ Returns True if botocore is available, otherwise raises NotConfigured. Never returns False.
9 +
10 +
    Previously, when boto was supported in addition to botocore, this returned False if boto was available
11 +
    but botocore wasn't.
12 +
    """
13 +
    message = (
14 +
        'is_botocore() is deprecated and always returns True or raises an Exception, '
15 +
        'so it cannot be used for checking if boto is available instead of botocore. '
16 +
        'You can use scrapy.utils.boto.is_botocore_available() to check if botocore '
17 +
        'is available.'
18 +
    )
19 +
    warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
7 20
    try:
8 21
        import botocore  # noqa: F401
9 22
        return True
10 23
    except ImportError:
11 24
        raise NotConfigured('missing botocore library')
25 +
26 +
27 +
def is_botocore_available():
28 +
    try:
29 +
        import botocore  # noqa: F401
30 +
        return True
31 +
    except ImportError:
32 +
        return False
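
A short sketch of using the new helper in place of the deprecated is_botocore(); the component class below is an illustrative assumption:

    from scrapy.exceptions import NotConfigured
    from scrapy.utils.boto import is_botocore_available

    class MyS3Component:
        def __init__(self):
            if not is_botocore_available():
                raise NotConfigured('missing botocore library')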

@@ -24,11 +24,11 @@
24 24
        o = cls(crawler.stats, recipients, mail)
25 25
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
26 26
        return o
27 -
        
27 +
28 28
    def spider_closed(self, spider):
29 29
        spider_stats = self.stats.get_stats(spider)
30 30
        body = "Global stats\n\n"
31 -
        body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
32 -
        body += "\n\n%s stats\n\n" % spider.name
33 -
        body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
34 -
        return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
31 +
        body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
32 +
        body += f"\n\n{spider.name} stats\n\n"
33 +
        body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
34 +
        return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)

@@ -55,8 +55,8 @@
55 55
        if isinstance(url, str):
56 56
            self._url = url
57 57
        else:
58 -
            raise TypeError('%s url must be str, got %s:' %
59 -
                            (type(self).__name__, type(url).__name__))
58 +
            raise TypeError(f'{type(self).__name__} url must be str, '
59 +
                            f'got {type(url).__name__}')
60 60
61 61
    url = property(_get_url, obsolete_setter(_set_url, 'url'))
62 62
@@ -77,7 +77,7 @@
77 77
    body = property(_get_body, obsolete_setter(_set_body, 'body'))
78 78
79 79
    def __str__(self):
80 -
        return "<%d %s>" % (self.status, self.url)
80 +
        return f"<{self.status} {self.url}>"
81 81
82 82
    __repr__ = __str__
83 83

@@ -17,8 +17,8 @@
17 17
18 18
    def _check_components(complist):
19 19
        if len({convert(c) for c in complist}) != len(complist):
20 -
            raise ValueError('Some paths in {!r} convert to the same object, '
21 -
                             'please update your settings'.format(complist))
20 +
            raise ValueError(f'Some paths in {complist!r} convert to the same object, '
21 +
                             'please update your settings')
22 22
23 23
    def _map_keys(compdict):
24 24
        if isinstance(compdict, BaseSettings):
@@ -26,9 +26,10 @@
26 26
            for k, v in compdict.items():
27 27
                prio = compdict.getpriority(k)
28 28
                if compbs.getpriority(convert(k)) == prio:
29 -
                    raise ValueError('Some paths in {!r} convert to the same '
29 +
                    raise ValueError(f'Some paths in {list(compdict.keys())!r} '
30 +
                                     'convert to the same '
30 31
                                     'object, please update your settings'
31 -
                                     ''.format(list(compdict.keys())))
32 +
                                     )
32 33
                else:
33 34
                    compbs.set(convert(k), v, priority=prio)
34 35
            return compbs
@@ -40,8 +41,8 @@
40 41
        """Fail if a value in the components dict is not a real number or None."""
41 42
        for name, value in compdict.items():
42 43
            if value is not None and not isinstance(value, numbers.Real):
43 -
                raise ValueError('Invalid value {} for component {}, please provide '
44 -
                                 'a real number or None instead'.format(value, name))
44 +
                raise ValueError(f'Invalid value {value} for component {name}, '
45 +
                                 'please provide a real number or None instead')
45 46
46 47
    # BEGIN Backward compatibility for old (base, custom) call signature
47 48
    if isinstance(custom, (list, tuple)):
@@ -120,6 +121,7 @@
120 121
    out.setdefault("fields", settings.getdictorlist("FEED_EXPORT_FIELDS") or None)
121 122
    out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
122 123
    out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
124 +
    out.setdefault("item_export_kwargs", dict())
123 125
    if settings["FEED_EXPORT_INDENT"] is None:
124 126
        out.setdefault("indent", None)
125 127
    else:
@@ -127,7 +129,8 @@
127 129
    return out
128 130
129 131
130 -
def feed_process_params_from_cli(settings, output, output_format=None):
132 +
def feed_process_params_from_cli(settings, output, output_format=None,
133 +
                                 overwrite_output=None):
131 134
    """
132 135
    Receives feed export params (from the 'crawl' or 'runspider' commands),
133 136
    checks for inconsistencies in their quantities and returns a dictionary
@@ -139,22 +142,37 @@
139 142
140 143
    def check_valid_format(output_format):
141 144
        if output_format not in valid_output_formats:
142 -
            raise UsageError("Unrecognized output format '%s', set one after a"
143 -
                             " colon using the -o option (i.e. -o <URI>:<FORMAT>)"
144 -
                             " or as a file extension, from the supported list %s" %
145 -
                             (output_format, tuple(valid_output_formats)))
145 +
            raise UsageError(
146 +
                f"Unrecognized output format '{output_format}'. "
147 +
                f"Set a supported one ({tuple(valid_output_formats)}) "
148 +
                "after a colon at the end of the output URI (i.e. -o/-O "
149 +
                "<URI>:<FORMAT>) or as a file extension."
150 +
            )
151 +
152 +
    overwrite = False
153 +
    if overwrite_output:
154 +
        if output:
155 +
            raise UsageError(
156 +
                "Please use only one of -o/--output and -O/--overwrite-output"
157 +
            )
158 +
        output = overwrite_output
159 +
        overwrite = True
146 160
147 161
    if output_format:
148 162
        if len(output) == 1:
149 163
            check_valid_format(output_format)
150 -
            warnings.warn('The -t command line option is deprecated in favor'
151 -
                          ' of specifying the output format within the -o'
152 -
                          ' option, please check the -o option docs for more details',
153 -
                          category=ScrapyDeprecationWarning, stacklevel=2)
164 +
            message = (
165 +
                'The -t command line option is deprecated in favor of '
166 +
                'specifying the output format within the output URI. See the '
167 +
                'documentation of the -o and -O options for more information.'
168 +
            )
169 +
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
154 170
            return {output[0]: {'format': output_format}}
155 171
        else:
156 -
            raise UsageError('The -t command line option cannot be used if multiple'
157 -
                             ' output files are specified with the -o option')
172 +
            raise UsageError(
173 +
                'The -t command-line option cannot be used if multiple output '
174 +
                'URIs are specified'
175 +
            )
158 176
159 177
    result = {}
160 178
    for element in output:
@@ -168,8 +186,10 @@
168 186
                feed_uri = 'stdout:'
169 187
        check_valid_format(feed_format)
170 188
        result[feed_uri] = {'format': feed_format}
189 +
        if overwrite:
190 +
            result[feed_uri]['overwrite'] = True
171 191
172 -
    # FEEDS setting should take precedence over the -o and -t CLI options
192 +
    # FEEDS setting should take precedence over the matching CLI options
173 193
    result.update(settings.getdict('FEEDS'))
174 194
175 195
    return result
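
A minimal usage sketch (not part of the patch) of how the new ``overwrite_output`` argument feeds into the returned parameters; the Settings object is assumed to define no FEEDS of its own, and the URI is illustrative.

from scrapy.settings import Settings
from scrapy.utils.conf import feed_process_params_from_cli

settings = Settings()  # assumed: no FEEDS setting configured
# Roughly what `scrapy crawl spider -O items.json` passes in:
params = feed_process_params_from_cli(
    settings, output=[], overwrite_output=['items.json'])
print(params)
# expected: {'items.json': {'format': 'json', 'overwrite': True}}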

@@ -14,9 +14,9 @@
14 14
    def deco(func):
15 15
        @wraps(func)
16 16
        def wrapped(*args, **kwargs):
17 -
            message = "Call to deprecated function %s." % func.__name__
17 +
            message = f"Call to deprecated function {func.__name__}."
18 18
            if use_instead:
19 -
                message += " Use %s instead." % use_instead
19 +
                message += f" Use {use_instead} instead."
20 20
            warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
21 21
            return func(*args, **kwargs)
22 22
        return wrapped
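
As a hedged usage sketch (the helper names are made up), the decorator above emits a ScrapyDeprecationWarning that points callers at the replacement:

import warnings
from scrapy.utils.decorators import deprecated

def new_helper(x):
    return x * 2

@deprecated(use_instead='new_helper')
def old_helper(x):  # hypothetical deprecated wrapper
    return new_helper(x)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    old_helper(2)
print(caught[0].message)
# Call to deprecated function old_helper. Use new_helper instead.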

@@ -5,7 +5,6 @@
5 5
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
6 6
from twisted.internet.ssl import AcceptableCiphers
7 7
8 -
from scrapy import twisted_version
9 8
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
10 9
11 10
@@ -28,13 +27,6 @@
28 27
}
29 28
30 29
31 -
if twisted_version < (17, 0, 0):
32 -
    from twisted.internet._sslverify import _maybeSetHostNameIndication as set_tlsext_host_name
33 -
else:
34 -
    def set_tlsext_host_name(connection, hostNameBytes):
35 -
        connection.set_tlsext_host_name(hostNameBytes)
36 -
37 -
38 30
class ScrapyClientTLSOptions(ClientTLSOptions):
39 31
    """
40 32
    SSL Client connection creator ignoring certificate verification errors
@@ -52,21 +44,14 @@
52 44
53 45
    def _identityVerifyingInfoCallback(self, connection, where, ret):
54 46
        if where & SSL.SSL_CB_HANDSHAKE_START:
55 -
            set_tlsext_host_name(connection, self._hostnameBytes)
47 +
            connection.set_tlsext_host_name(self._hostnameBytes)
56 48
        elif where & SSL.SSL_CB_HANDSHAKE_DONE:
57 49
            if self.verbose_logging:
58 -
                if hasattr(connection, 'get_cipher_name'):  # requires pyOPenSSL 0.15
59 -
                    if hasattr(connection, 'get_protocol_version_name'):  # requires pyOPenSSL 16.0.0
60 -
                        logger.debug('SSL connection to %s using protocol %s, cipher %s',
61 -
                                     self._hostnameASCII,
62 -
                                     connection.get_protocol_version_name(),
63 -
                                     connection.get_cipher_name(),
64 -
                                     )
65 -
                    else:
66 -
                        logger.debug('SSL connection to %s using cipher %s',
67 -
                                     self._hostnameASCII,
68 -
                                     connection.get_cipher_name(),
69 -
                                     )
50 +
                logger.debug('SSL connection to %s using protocol %s, cipher %s',
51 +
                             self._hostnameASCII,
52 +
                             connection.get_protocol_version_name(),
53 +
                             connection.get_cipher_name(),
54 +
                             )
70 55
                server_cert = connection.get_peer_certificate()
71 56
                logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
72 57
                             x509name_to_string(server_cert.get_issuer()),

@@ -25,13 +25,13 @@
25 25
        self._set_url(url)
26 26
        self._set_body(body)
27 27
        if not isinstance(priority, int):
28 -
            raise TypeError("Request priority not an integer: %r" % priority)
28 +
            raise TypeError(f"Request priority not an integer: {priority!r}")
29 29
        self.priority = priority
30 30
31 31
        if callback is not None and not callable(callback):
32 -
            raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
32 +
            raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
33 33
        if errback is not None and not callable(errback):
34 -
            raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
34 +
            raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
35 35
        self.callback = callback
36 36
        self.errback = errback
37 37
@@ -60,13 +60,17 @@
60 60
61 61
    def _set_url(self, url):
62 62
        if not isinstance(url, str):
63 -
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
63 +
            raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
64 64
65 65
        s = safe_url_string(url, self.encoding)
66 66
        self._url = escape_ajax(s)
67 67
68 -
        if ('://' not in self._url) and (not self._url.startswith('data:')):
69 -
            raise ValueError('Missing scheme in request url: %s' % self._url)
68 +
        if (
69 +
            '://' not in self._url
70 +
            and not self._url.startswith('about:')
71 +
            and not self._url.startswith('data:')
72 +
        ):
73 +
            raise ValueError(f'Missing scheme in request url: {self._url}')
70 74
71 75
    url = property(_get_url, obsolete_setter(_set_url, 'url'))
72 76
@@ -86,7 +90,7 @@
86 90
        return self._encoding
87 91
88 92
    def __str__(self):
89 -
        return "<%s %s>" % (self.method, self.url)
93 +
        return f"<{self.method} {self.url}>"
90 94
91 95
    __repr__ = __str__
92 96
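
A short, hedged illustration of the validation shown above (URLs and values are examples, not from the patch): 'about:' URLs are now accepted without a '://' scheme separator, while a missing scheme or a non-integer priority still raises.

from scrapy import Request

Request('https://example.com')   # fine: has a scheme
Request('about:blank')           # fine: 'about:' URLs no longer need '://'

try:
    Request('example.com')       # no scheme at all
except ValueError as exc:
    print(exc)                   # Missing scheme in request url: example.com

try:
    Request('https://example.com', priority='high')
except TypeError as exc:
    print(exc)                   # Request priority not an integer: 'high'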

@@ -28,8 +28,8 @@
28 28
29 29
30 30
# Check minimum required Python version
31 -
if sys.version_info < (3, 5, 2):
32 -
    print("Scrapy %s requires Python 3.5.2" % __version__)
31 +
if sys.version_info < (3, 6):
32 +
    print("Scrapy %s requires Python 3.6+" % __version__)
33 33
    sys.exit(1)
34 34
35 35

@@ -41,9 +41,7 @@
41 41
        if issubclass(cls, ignore):
42 42
            continue
43 43
        oldest = min(wdict.values())
44 -
        s += "%-30s %6d   oldest: %ds ago\n" % (
45 -
            cls.__name__, len(wdict), now - oldest
46 -
        )
44 +
        s += f"{cls.__name__:<30} {len(wdict):6}   oldest: {int(now - oldest)}s ago\n"
47 45
    return s
48 46
49 47
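
For context, a hedged sketch of the output this reformatted line produces (the tracked class, count and spacing are illustrative):

from scrapy.utils.trackref import object_ref, format_live_refs

class TrackedThing(object_ref):  # hypothetical tracked class
    pass

things = [TrackedThing() for _ in range(3)]
print(format_live_refs())
# Live References
#
# TrackedThing                        3   oldest: 0s ago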

@@ -171,8 +171,8 @@
171 171
    def _handle_downloader_output(self, response, request, spider):
172 172
        if not isinstance(response, (Request, Response, Failure)):
173 173
            raise TypeError(
174 -
                "Incorrect type: expected Request, Response or Failure, got %s: %r"
175 -
                % (type(response), response)
174 +
                "Incorrect type: expected Request, Response or Failure, got "
175 +
                f"{type(response)}: {response!r}"
176 176
            )
177 177
        # downloader middleware can return requests (for example, redirects)
178 178
        if isinstance(response, Request):
@@ -214,7 +214,7 @@
214 214
215 215
    def crawl(self, request, spider):
216 216
        if spider not in self.open_spiders:
217 -
            raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
217 +
            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
218 218
        self.schedule(request, spider)
219 219
        self.slot.nextcall.schedule()
220 220
@@ -239,16 +239,21 @@
239 239
        def _on_success(response):
240 240
            if not isinstance(response, (Response, Request)):
241 241
                raise TypeError(
242 -
                    "Incorrect type: expected Response or Request, got %s: %r"
243 -
                    % (type(response), response)
242 +
                    "Incorrect type: expected Response or Request, got "
243 +
                    f"{type(response)}: {response!r}"
244 244
                )
245 245
            if isinstance(response, Response):
246 -
                response.request = request  # tie request to response received
247 -
                logkws = self.logformatter.crawled(request, response, spider)
246 +
                if response.request is None:
247 +
                    response.request = request
248 +
                logkws = self.logformatter.crawled(response.request, response, spider)
248 249
                if logkws is not None:
249 250
                    logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
250 -
                self.signals.send_catch_log(signals.response_received,
251 -
                                            response=response, request=request, spider=spider)
251 +
                self.signals.send_catch_log(
252 +
                    signal=signals.response_received,
253 +
                    response=response,
254 +
                    request=response.request,
255 +
                    spider=spider,
256 +
                )
252 257
            return response
253 258
254 259
        def _on_complete(_):
@@ -263,7 +268,7 @@
263 268
    @defer.inlineCallbacks
264 269
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
265 270
        if not self.has_capacity():
266 -
            raise RuntimeError("No free spider slot when opening %r" % spider.name)
271 +
            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
267 272
        logger.info("Spider opened", extra={'spider': spider})
268 273
        nextcall = CallLaterOnce(self._next_request, spider)
269 274
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
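
Hedged sketch of a consumer affected by the change above: a ``response_received`` handler (hypothetical extension) now receives whatever request is already attached to the response, for example one replaced by a downloader middleware, rather than always the object the engine scheduled.

from scrapy import signals

class ResponseLogger:
    """Hypothetical extension that logs every received response."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(
            ext.response_received, signal=signals.response_received)
        return ext

    def response_received(self, response, request, spider):
        # `request` is response.request, as sent by the engine above
        spider.logger.info('%s -> %d', request.url, response.status)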

@@ -180,9 +180,9 @@
180 180
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
181 181
            :class:`~scrapy.spiders.Spider` subclass or string
182 182
183 -
        :param list args: arguments to initialize the spider
183 +
        :param args: arguments to initialize the spider
184 184
185 -
        :param dict kwargs: keyword arguments to initialize the spider
185 +
        :param kwargs: keyword arguments to initialize the spider
186 186
        """
187 187
        if isinstance(crawler_or_spidercls, Spider):
188 188
            raise ValueError(
@@ -307,7 +307,7 @@
307 307
        If ``stop_after_crawl`` is True, the reactor will be stopped after all
308 308
        crawlers have finished, using :meth:`join`.
309 309
310 -
        :param boolean stop_after_crawl: stop or not the reactor when all
310 +
        :param bool stop_after_crawl: whether to stop the reactor once all
311 311
            crawlers have finished
312 312
        """
313 313
        from twisted.internet import reactor
@@ -340,5 +340,5 @@
340 340
341 341
    def _handle_twisted_reactor(self):
342 342
        if self.settings.get("TWISTED_REACTOR"):
343 -
            install_reactor(self.settings["TWISTED_REACTOR"])
343 +
            install_reactor(self.settings["TWISTED_REACTOR"], self.settings["ASYNCIO_EVENT_LOOP"])
344 344
        super()._handle_twisted_reactor()
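
A hedged configuration sketch for the setting pair now passed to install_reactor(); the uvloop value is only an example and assumes uvloop is installed.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
    'ASYNCIO_EVENT_LOOP': 'uvloop.Loop',  # example loop class; requires uvloop
})
# _handle_twisted_reactor() (shown above) installs the chosen reactor,
# now together with the configured asyncio event loop.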

@@ -5,6 +5,7 @@
5 5
import re
6 6
import hashlib
7 7
import warnings
8 +
from collections import deque
8 9
from contextlib import contextmanager
9 10
from importlib import import_module
10 11
from pkgutil import iter_modules
@@ -38,14 +39,24 @@
38 39
def load_object(path):
39 40
    """Load an object given its absolute object path, and return it.
40 41
41 -
    object can be the import path of a class, function, variable or an
42 -
    instance, e.g. 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'
42 +
    The object can be the import path of a class, function, variable or an
43 +
    instance, e.g. 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'.
44 +
45 +
    If ``path`` is not a string, but is a callable object, such as a class or
46 +
    a function, then return it as is.
43 47
    """
44 48
49 +
    if not isinstance(path, str):
50 +
        if callable(path):
51 +
            return path
52 +
        else:
53 +
            raise TypeError("Unexpected argument type, expected string "
54 +
                            "or object, got: %s" % type(path))
55 +
45 56
    try:
46 57
        dot = path.rindex('.')
47 58
    except ValueError:
48 -
        raise ValueError("Error loading object '%s': not a full path" % path)
59 +
        raise ValueError(f"Error loading object '{path}': not a full path")
49 60
50 61
    module, name = path[:dot], path[dot + 1:]
51 62
    mod = import_module(module)
@@ -53,7 +64,7 @@
53 64
    try:
54 65
        obj = getattr(mod, name)
55 66
    except AttributeError:
56 -
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
67 +
        raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
57 68
58 69
    return obj
59 70
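
Hedged usage sketch of the widened load_object() contract described in the docstring above:

from scrapy.utils.misc import load_object

cls = load_object('scrapy.downloadermiddlewares.redirect.RedirectMiddleware')
assert load_object(cls) is cls   # callables are now returned unchanged

try:
    load_object(42)              # neither a string nor a callable
except TypeError as exc:
    print(exc)
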
@@ -162,7 +173,7 @@
162 173
        instance = objcls(*args, **kwargs)
163 174
        method_name = '__new__'
164 175
    if instance is None:
165 -
        raise TypeError("%s.%s returned None" % (objcls.__qualname__, method_name))
176 +
        raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
166 177
    return instance
167 178
168 179
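
And a hedged sketch of create_instance(), whose error message the hunk above reworks; the component class and setting name are made up.

from scrapy.settings import Settings
from scrapy.utils.misc import create_instance

class DummyComponent:  # hypothetical component with a from_settings constructor
    def __init__(self, enabled=True):
        self.enabled = enabled

    @classmethod
    def from_settings(cls, settings):
        return cls(enabled=settings.getbool('DUMMY_ENABLED', True))

instance = create_instance(DummyComponent, settings=Settings(), crawler=None)
assert instance.enabled
# If from_settings returned None, the new message would read:
# "DummyComponent.from_settings returned None"
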
@@ -184,6 +195,22 @@
184 195
                os.environ[k] = v
185 196
186 197
198 +
def walk_callable(node):
199 +
    """Similar to ``ast.walk``, but walks only function body and skips nested
200 +
    f