"""Download handlers for http and https schemes"""

import ipaddress
import logging
import re
import warnings
from contextlib import suppress
from io import BytesIO
from time import time
from urllib.parse import urldefrag

from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from zope.interface import implementer

from scrapy import signals
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.python import to_bytes, to_unicode


logger = logging.getLogger(__name__)


class HTTP11DownloadHandler:
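    # Not lazy: handlers with lazy = False are instantiated as soon as the
    # download handler manager is created, rather than on the first request
    # for their scheme.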
    lazy = False

    def __init__(self, settings, crawler=None):
        self._crawler = crawler

        from twisted.internet import reactor
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._contextFactory = load_context_factory_from_settings(settings, crawler)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
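        # Seconds that close() waits before firing the connection pool's
        # shutdown deferred itself (see the comment in close()).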
        self._disconnect_timeout = 1

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
            crawler=self._crawler,
        )
        return agent.download_request(request)

    def close(self):
        from twisted.internet import reactor
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually time out the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d


class TunnelError(Exception):
    """An HTTP CONNECT tunnel could not be established by the proxy."""


class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
    """An endpoint that tunnels through proxies to allow HTTPS downloads. To
    accomplish that, this endpoint sends an HTTP CONNECT to the proxy.
    The HTTP CONNECT is always sent when using this endpoint; this could be
    improved, as the CONNECT is redundant if the connection associated
    with this endpoint comes from the pool and a CONNECT has already been issued
    for it.
    """
    _truncatedLength = 1000
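    # Matches the proxy's reply to the CONNECT request, capturing the HTTP
    # status code and at most _truncatedLength characters of the rest of the
    # status line for error reporting.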
    _responseAnswer = r'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,' + str(_truncatedLength) + r'})'
    _responseMatcher = re.compile(_responseAnswer.encode())

    def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
        proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
        super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
        self._tunnelReadyDeferred = defer.Deferred()
        self._tunneledHost = host
        self._tunneledPort = port
        self._contextFactory = contextFactory
        self._connectBuffer = bytearray()

    def requestTunnel(self, protocol):
        """Asks the proxy to open a tunnel."""
        tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
        protocol.transport.write(tunnelReq)
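        # Intercept dataReceived so the proxy's reply to the CONNECT request is
        # handled here instead of reaching the HTTP parser; processProxyResponse
        # restores the original handler once the full reply has been received.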
        self._protocolDataReceived = protocol.dataReceived
        protocol.dataReceived = self.processProxyResponse
        self._protocol = protocol
        return protocol

    def processProxyResponse(self, rcvd_bytes):
        """Processes the response from the proxy. If the tunnel is successfully
        created, notifies the client that we are ready to send requests. If not,
        raises a TunnelError.
        """
        self._connectBuffer += rcvd_bytes
        # make sure that enough (all) bytes are consumed
        # and that we've got all HTTP headers (ending with a blank line)
        # from the proxy so that we don't send those bytes to the TLS layer
        #
        # see https://github.com/scrapy/scrapy/issues/2491
        if b'\r\n\r\n' not in self._connectBuffer:
            return
        self._protocol.dataReceived = self._protocolDataReceived
        respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
        if respm and int(respm.group('status')) == 200:
            # set proper Server Name Indication extension
            sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
            self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
            self._tunnelReadyDeferred.callback(self._protocol)
        else:
            if respm:
                extra = {'status': int(respm.group('status')),
                         'reason': respm.group('reason').strip()}
            else:
                extra = rcvd_bytes[:self._truncatedLength]
            self._tunnelReadyDeferred.errback(
                TunnelError('Could not open CONNECT tunnel with proxy '
                            f'{self._host}:{self._port} [{extra!r}]')
            )

    def connectFailed(self, reason):
        """Propagates the errback to the appropriate deferred."""
        self._tunnelReadyDeferred.errback(reason)

    def connect(self, protocolFactory):
        self._protocolFactory = protocolFactory
        connectDeferred = super().connect(protocolFactory)
        connectDeferred.addCallback(self.requestTunnel)
        connectDeferred.addErrback(self.connectFailed)
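        # Return the tunnel deferred instead of the connection deferred, so
        # callers only get the protocol after the CONNECT exchange succeeded
        # and TLS has been started over the tunnel.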
        return self._tunnelReadyDeferred


def tunnel_request_data(host, port, proxy_auth_header=None):
    r"""
    Return binary content of a CONNECT request.

    >>> from scrapy.utils.python import to_unicode as s
    >>> s(tunnel_request_data("example.com", 8080))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
    >>> s(tunnel_request_data("example.com", 8080, b"123"))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
    >>> s(tunnel_request_data(b"example.com", "8090"))
    'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
    """
    host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
    tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
    tunnel_req += b'Host: ' + host_value + b'\r\n'
    if proxy_auth_header:
        tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
    tunnel_req += b'\r\n'
    return tunnel_req


class TunnelingAgent(Agent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
    downloads. It may look strange that we have chosen to subclass Agent and not
    ProxyAgent, but consider that after the tunnel is opened the proxy is
    transparent to the client; thus the agent should behave like there is no
    proxy involved.
    """

    def __init__(self, reactor, proxyConf, contextFactory=None,
                 connectTimeout=None, bindAddress=None, pool=None):
        super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
        self._proxyConf = proxyConf
        self._contextFactory = contextFactory

    def _getEndpoint(self, uri):
        return TunnelingTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=self._proxyConf,
            contextFactory=self._contextFactory,
            timeout=self._endpointFactory._connectTimeout,
            bindAddress=self._endpointFactory._bindAddress,
        )

    def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
        # proxy host and port are required for HTTP pool `key`
        # otherwise, same remote host connection request could reuse
        # a cached tunneled connection to a different proxy
        key = key + self._proxyConf
        return super()._requestWithEndpoint(
            key=key,
            endpoint=endpoint,
            method=method,
            parsedURI=parsedURI,
            headers=headers,
            bodyProducer=bodyProducer,
            requestPath=requestPath,
        )


class ScrapyProxyAgent(Agent):

    def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
        super().__init__(
            reactor=reactor,
            connectTimeout=connectTimeout,
            bindAddress=bindAddress,
            pool=pool,
        )
        self._proxyURI = URI.fromBytes(proxyURI)

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request via the configured proxy.
        """
        # Cache *all* connections under the same key, since we are only
        # connecting to a single destination, the proxy:
        return self._requestWithEndpoint(
            key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
            endpoint=self._getEndpoint(self._proxyURI),
            method=method,
            parsedURI=URI.fromBytes(uri),
            headers=headers,
            bodyProducer=bodyProducer,
            requestPath=uri,
        )


class ScrapyAgent:

    _Agent = Agent
    _ProxyAgent = ScrapyProxyAgent
    _TunnelingAgent = TunnelingAgent
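    # Agent implementations used by _get_agent(): a plain Agent for direct
    # connections, ScrapyProxyAgent for plain HTTP through a proxy, and
    # TunnelingAgent for HTTPS through an HTTP CONNECT tunnel. Kept as class
    # attributes so they are easy to override in subclasses.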

    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
                 maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
        self._contextFactory = contextFactory
        self._connectTimeout = connectTimeout
        self._bindAddress = bindAddress
        self._pool = pool
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._txresponse = None
        self._crawler = crawler

    def _get_agent(self, request, timeout):
        from twisted.internet import reactor
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if omitConnectTunnel:
                warnings.warn(
                    "Using HTTPS proxies in the noconnect mode is deprecated. "
                    "If you use Zyte Smart Proxy Manager, it doesn't require "
                    "this mode anymore, so you should update scrapy-crawlera "
                    "to scrapy-zyte-smartproxy and remove '?noconnect' "
                    "from the Zyte Smart Proxy Manager URL.",
                    ScrapyDeprecationWarning,
                )
            if scheme == b'https' and not omitConnectTunnel:
                proxyAuth = request.headers.get(b'Proxy-Authorization', None)
                proxyConf = (proxyHost, proxyPort, proxyAuth)
                return self._TunnelingAgent(
                    reactor=reactor,
                    proxyConf=proxyConf,
                    contextFactory=self._contextFactory,
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool,
                )
            else:
                return self._ProxyAgent(
                    reactor=reactor,
                    proxyURI=to_bytes(proxy, encoding='ascii'),
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool,
                )

        return self._Agent(
            reactor=reactor,
            contextFactory=self._contextFactory,
            connectTimeout=timeout,
            bindAddress=bindaddress,
            pool=self._pool,
        )

    def download_request(self, request):
        from twisted.internet import reactor
        timeout = request.meta.get('download_timeout') or self._connectTimeout
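        # The same value serves both as the agent's connection timeout and as
        # the overall download deadline enforced below via reactor.callLater().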
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d

    def _cb_timeout(self, result, request, url, timeout):
        if self._timeout_cl.active():
            self._timeout_cl.cancel()
            return result
        # needed for HTTPS requests, otherwise _ResponseReader doesn't
        # receive connectionLost()
        if self._txresponse:
            self._txresponse._transport.stopProducing()

        raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")

    def _cb_latency(self, result, request, start_time):
        request.meta['download_latency'] = time() - start_time
        return result

    @staticmethod
    def _headers_from_twisted_response(response):
        headers = Headers()
        if response.length != UNKNOWN_LENGTH:
            headers[b'Content-Length'] = str(response.length).encode()
        headers.update(response.headers.getAllRawHeaders())
        return headers

    def _cb_bodyready(self, txresponse, request):
        headers_received_result = self._crawler.signals.send_catch_log(
            signal=signals.headers_received,
            headers=self._headers_from_twisted_response(txresponse),
            body_length=txresponse.length,
            request=request,
            spider=self._crawler.spider,
        )
        for handler, result in headers_received_result:
            if isinstance(result, Failure) and isinstance(result.value, StopDownload):
                logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
                             {"request": request, "handler": handler.__qualname__})
                txresponse._transport.stopProducing()
                with suppress(AttributeError):
                    txresponse._transport._producer.loseConnection()
                return {
                    "txresponse": txresponse,
                    "body": b"",
                    "flags": ["download_stopped"],
                    "certificate": None,
                    "ip_address": None,
                    "failure": result if result.value.fail else None,
                }

        # deliverBody hangs for responses without body
        if txresponse.length == 0:
            return {
                "txresponse": txresponse,
                "body": b"",
                "flags": None,
                "certificate": None,
                "ip_address": None,
            }

        maxsize = request.meta.get('download_maxsize', self._maxsize)
        warnsize = request.meta.get('download_warnsize', self._warnsize)
        expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
        fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)

        if maxsize and expected_size > maxsize:
            warning_msg = ("Cancelling download of %(url)s: expected response "
                           "size (%(size)s) larger than download max size (%(maxsize)s).")
            warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}

            logger.warning(warning_msg, warning_args)

            txresponse._transport._producer.loseConnection()
            raise defer.CancelledError(warning_msg % warning_args)

        if warnsize and expected_size > warnsize:
            logger.warning("Expected response size (%(size)s) larger than "
                           "download warn size (%(warnsize)s) in request %(request)s.",
                           {'size': expected_size, 'warnsize': warnsize, 'request': request})

        def _cancel(_):
            # Abort connection immediately.
            txresponse._transport._producer.abortConnection()

        d = defer.Deferred(_cancel)
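        # `d` is fired by _ResponseReader._finish_response() with the response
        # bundle; cancelling it (e.g. from the maxsize check in the reader)
        # runs _cancel() above and aborts the connection.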
429 4
        txresponse.deliverBody(
430
            _ResponseReader(
431
                finished=d,
432
                txresponse=txresponse,
433
                request=request,
434
                maxsize=maxsize,
435
                warnsize=warnsize,
436
                fail_on_dataloss=fail_on_dataloss,
437
                crawler=self._crawler,
438
            )
439
        )
440

441
        # save response for timeouts
442 4
        self._txresponse = txresponse
443

444 4
        return d
445

446 4
    def _cb_bodydone(self, result, request, url):
447 4
        headers = self._headers_from_twisted_response(result["txresponse"])
448 4
        respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
449 4
        try:
450 4
            version = result["txresponse"].version
451 4
            protocol = f"{to_unicode(version[0])}/{version[1]}.{version[2]}"
452 0
        except (AttributeError, TypeError, IndexError):
453 0
            protocol = None
454 4
        response = respcls(
455
            url=url,
456
            status=int(result["txresponse"].code),
457
            headers=headers,
458
            body=result["body"],
459
            flags=result["flags"],
460
            certificate=result["certificate"],
461
            ip_address=result["ip_address"],
462
            protocol=protocol,
463
        )
464 4
        if result.get("failure"):
465 4
            result["failure"].value.response = response
466 4
            return result["failure"]
467 4
        return response
468

469

470 4
@implementer(IBodyProducer)
471 1
class _RequestBodyProducer:
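    # Minimal IBodyProducer: the whole request body is written in a single
    # call and its length is known up front, so Twisted can send a
    # Content-Length header instead of using chunked transfer encoding.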

    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        consumer.write(self.body)
        return defer.succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass


class _ResponseReader(protocol.Protocol):

    def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
        self._finished = finished
        self._txresponse = txresponse
        self._request = request
        self._bodybuf = BytesIO()
        self._maxsize = maxsize
        self._warnsize = warnsize
        self._fail_on_dataloss = fail_on_dataloss
        self._fail_on_dataloss_warned = False
        self._reached_warnsize = False
        self._bytes_received = 0
        self._certificate = None
        self._ip_address = None
        self._crawler = crawler

    def _finish_response(self, flags=None, failure=None):
        self._finished.callback({
            "txresponse": self._txresponse,
            "body": self._bodybuf.getvalue(),
            "flags": flags,
            "certificate": self._certificate,
            "ip_address": self._ip_address,
            "failure": failure,
        })

    def connectionMade(self):
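        # transport._producer is the underlying TCP/TLS transport; capture the
        # peer certificate (only available for TLS connections, hence the
        # suppressed AttributeError) and the remote IP address so they can be
        # attached to the final Response.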
        if self._certificate is None:
            with suppress(AttributeError):
                self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())

        if self._ip_address is None:
            self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)

    def dataReceived(self, bodyBytes):
        # This may be called several times after cancel was called, with buffered data.
        if self._finished.called:
            return

        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        bytes_received_result = self._crawler.signals.send_catch_log(
            signal=signals.bytes_received,
            data=bodyBytes,
            request=self._request,
            spider=self._crawler.spider,
        )
        for handler, result in bytes_received_result:
            if isinstance(result, Failure) and isinstance(result.value, StopDownload):
                logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
                             {"request": self._request, "handler": handler.__qualname__})
                self.transport.stopProducing()
                self.transport._producer.loseConnection()
                failure = result if result.value.fail else None
                self._finish_response(flags=["download_stopped"], failure=failure)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.warning("Received (%(bytes)s) bytes larger than download "
                           "max size (%(maxsize)s) in request %(request)s.",
                           {'bytes': self._bytes_received,
                            'maxsize': self._maxsize,
                            'request': self._request})
            # Clear buffer earlier to avoid keeping data in memory for a long time.
            self._bodybuf.truncate(0)
            self._finished.cancel()

        if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
            self._reached_warnsize = True
            logger.warning("Received more bytes than download "
                           "warn size (%(warnsize)s) in request %(request)s.",
                           {'warnsize': self._warnsize,
                            'request': self._request})

    def connectionLost(self, reason):
        if self._finished.called:
            return

        if reason.check(ResponseDone):
            self._finish_response()
            return

        if reason.check(PotentialDataLoss):
            self._finish_response(flags=["partial"])
            return

        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
            if not self._fail_on_dataloss:
                self._finish_response(flags=["dataloss"])
                return

            elif not self._fail_on_dataloss_warned:
                logger.warning("Got data loss in %s. If you want to process broken "
                               "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
                               " -- This message won't be shown in further requests",
                               self._txresponse.request.absoluteURI.decode())
                self._fail_on_dataloss_warned = True

        self._finished.errback(reason)
