scrapy / scrapy

@@ -15,7 +15,11 @@
 from scrapy import signals
 from scrapy.core.scraper import Scraper
-from scrapy.exceptions import DontCloseSpider, ScrapyDeprecationWarning
+from scrapy.exceptions import (
+    CloseSpider,
+    DontCloseSpider,
+    ScrapyDeprecationWarning,
+)
 from scrapy.http import Response, Request
 from scrapy.settings import BaseSettings
 from scrapy.spiders import Spider
@@ -325,14 +329,23 @@
         Called when a spider gets idle, i.e. when there are no remaining requests to download or schedule.
         It can be called multiple times. If a handler for the spider_idle signal raises a DontCloseSpider
         exception, the spider is not closed until the next loop and this function is guaranteed to be called
-        (at least) once again.
+        (at least) once again. A handler can raise CloseSpider to provide a custom closing reason.
         """
         assert self.spider is not None  # typing
-        res = self.signals.send_catch_log(signals.spider_idle, spider=self.spider, dont_log=DontCloseSpider)
-        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
+        expected_ex = (DontCloseSpider, CloseSpider)
+        res = self.signals.send_catch_log(signals.spider_idle, spider=self.spider, dont_log=expected_ex)
+        detected_ex = {
+            ex: x.value
+            for _, x in res
+            for ex in expected_ex
+            if isinstance(x, Failure) and isinstance(x.value, ex)
+        }
+        if DontCloseSpider in detected_ex:
             return None
         if self.spider_is_idle():
-            self.close_spider(self.spider, reason='finished')
+            ex = detected_ex.get(CloseSpider, CloseSpider(reason='finished'))
+            assert isinstance(ex, CloseSpider)  # typing
+            self.close_spider(self.spider, reason=ex.reason)

     def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
         """Close (cancel) spider and clear all its outstanding requests"""

@@ -11,6 +11,7 @@
 import warnings
 from datetime import datetime
 from tempfile import NamedTemporaryFile
+from typing import Any, Optional, Tuple
 from urllib.parse import unquote, urlparse

 from twisted.internet import defer, threads
@@ -47,6 +48,40 @@
     return builder(*preargs, uri, *args, **kwargs)


+class ItemFilter:
+    """
+    This will be used by FeedExporter to decide if an item should be allowed
+    to be exported to a particular feed.
+
+    :param feed_options: feed specific options passed from FeedExporter
+    :type feed_options: dict
+    """
+    feed_options: Optional[dict]
+    item_classes: Tuple
+
+    def __init__(self, feed_options: Optional[dict]) -> None:
+        self.feed_options = feed_options
+        if feed_options is not None:
+            self.item_classes = tuple(
+                load_object(item_class) for item_class in feed_options.get("item_classes") or ()
+            )
+        else:
+            self.item_classes = tuple()
+
+    def accepts(self, item: Any) -> bool:
+        """
+        Return ``True`` if `item` should be exported or ``False`` otherwise.
+
+        :param item: scraped item which user wants to check if is acceptable
+        :type item: :ref:`Scrapy items <topics-items>`
+        :return: `True` if accepted, `False` otherwise
+        :rtype: bool
+        """
+        if self.item_classes:
+            return isinstance(item, self.item_classes)
+        return True  # accept all items by default
+
+
 class IFeedStorage(Interface):
     """Interface that all Feed Storages must implement"""
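
The default ItemFilter only checks the item_classes feed option, so per-feed filtering by item class needs no custom code. A configuration sketch under assumed names (products.json, all-items.jl and myproject.items.ProductItem are hypothetical):

# settings.py sketch: the JSON feed only receives ProductItem instances,
# while the jsonlines feed keeps accepting every item (the default).
FEEDS = {
    "products.json": {
        "format": "json",
        "item_classes": ["myproject.items.ProductItem"],
    },
    "all-items.jl": {
        "format": "jsonlines",
    },
}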
@@ -119,7 +154,7 @@
 class S3FeedStorage(BlockingFeedStorage):

-    def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
+    def __init__(self, uri, access_key=None, secret_key=None, acl=None, endpoint_url=None, *,
                  feed_options=None):
         if not is_botocore_available():
             raise NotConfigured('missing botocore library')
@@ -129,11 +164,13 @@
         self.secret_key = u.password or secret_key
         self.keyname = u.path[1:]  # remove first "/"
         self.acl = acl
+        self.endpoint_url = endpoint_url
         import botocore.session
         session = botocore.session.get_session()
         self.s3_client = session.create_client(
             's3', aws_access_key_id=self.access_key,
-            aws_secret_access_key=self.secret_key)
+            aws_secret_access_key=self.secret_key,
+            endpoint_url=self.endpoint_url)
         if feed_options and feed_options.get('overwrite', True) is False:
             logger.warning('S3 does not support appending to files. To '
                            'suppress this warning, remove the overwrite '
@@ -147,6 +184,7 @@
             access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
             secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
             acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
+            endpoint_url=crawler.settings['AWS_ENDPOINT_URL'] or None,
             feed_options=feed_options,
         )
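
Since endpoint_url is now read from the AWS_ENDPOINT_URL setting, S3 feed exports can be pointed at S3-compatible services. A settings sketch, assuming a locally running MinIO server (the URL, bucket and credentials below are placeholders):

AWS_ACCESS_KEY_ID = "minio-access-key"        # placeholder credentials
AWS_SECRET_ACCESS_KEY = "minio-secret-key"
AWS_ENDPOINT_URL = "http://localhost:9000"    # placeholder S3-compatible endpoint

FEEDS = {
    "s3://my-bucket/items.json": {"format": "json"},
}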
@@ -216,7 +254,7 @@
 class _FeedSlot:
-    def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template):
+    def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template, filter):
         self.file = file
         self.exporter = exporter
         self.storage = storage
@@ -226,6 +264,7 @@
         self.store_empty = store_empty
         self.uri_template = uri_template
         self.uri = uri
+        self.filter = filter
         # flags
         self.itemcount = 0
         self._exporting = False
@@ -256,6 +295,7 @@
         self.settings = crawler.settings
         self.feeds = {}
         self.slots = []
+        self.filters = {}

         if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
             raise NotConfigured
@@ -270,12 +310,14 @@
             uri = str(self.settings['FEED_URI'])  # handle pathlib.Path objects
             feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
             self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
+            self.filters[uri] = self._load_filter(feed_options)
         # End: Backward compatibility for FEED_URI and FEED_FORMAT settings

         # 'FEEDS' setting takes precedence over 'FEED_URI'
         for uri, feed_options in self.settings.getdict('FEEDS').items():
             uri = str(uri)  # handle pathlib.Path objects
             self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
+            self.filters[uri] = self._load_filter(feed_options)

         self.storages = self._load_components('FEED_STORAGES')
         self.exporters = self._load_components('FEED_EXPORTERS')
@@ -372,6 +414,7 @@
             store_empty=feed_options['store_empty'],
             batch_id=batch_id,
             uri_template=uri_template,
+            filter=self.filters[uri_template]
         )
         if slot.store_empty:
             slot.start_exporting()
@@ -380,6 +423,10 @@
     def item_scraped(self, item, spider):
         slots = []
         for slot in self.slots:
+            if not slot.filter.accepts(item):
+                slots.append(slot)    # if slot doesn't accept item, continue with next slot
+                continue
+
             slot.start_exporting()
             slot.exporter.export_item(item)
             slot.itemcount += 1
@@ -490,3 +537,8 @@
         uripar_function = load_object(uri_params) if uri_params else lambda x, y: None
         uripar_function(params, spider)
         return params
+
+    def _load_filter(self, feed_options):
+        # load the item filter if declared else load the default filter class
+        item_filter_class = load_object(feed_options.get("item_filter", ItemFilter))
+        return item_filter_class(feed_options)
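
Beyond item_classes, the item_filter feed option can point at a custom filter class exposing an accepts() method. A sketch, assuming ItemFilter is importable from scrapy.extensions.feedexport; the module, class and field names below are hypothetical:

# myproject/filters.py
from scrapy.extensions.feedexport import ItemFilter


class PricedItemsFilter(ItemFilter):
    def accepts(self, item):
        # Export only items that carry a non-empty "price" field.
        return bool(item.get("price"))


# settings.py
FEEDS = {
    "priced-items.json": {
        "format": "json",
        "item_filter": "myproject.filters.PricedItemsFilter",
    },
}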

@@ -1,5 +1,5 @@
 """Helper functions for working with signals"""
-
+import collections
 import logging

 from twisted.internet.defer import DeferredList, Deferred
@@ -16,15 +16,13 @@
 logger = logging.getLogger(__name__)


-class _IgnoredException(Exception):
-    pass
-
-
 def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
     """Like pydispatcher.robust.sendRobust but it also logs errors and returns
     Failures instead of exceptions.
     """
-    dont_log = (named.pop('dont_log', _IgnoredException), StopDownload)
+    dont_log = named.pop('dont_log', ())
+    dont_log = tuple(dont_log) if isinstance(dont_log, collections.Sequence) else (dont_log,)
+    dont_log += (StopDownload, )
     spider = named.get('spider', None)
     responses = []
     for receiver in liveReceivers(getAllReceivers(sender, signal)):
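
With this change dont_log accepts either a single exception class or a sequence of them. A small sketch of both call forms (the stand-in Spider instance exists only to satisfy the logging helper):

from scrapy import Spider, signals
from scrapy.exceptions import CloseSpider, DontCloseSpider
from scrapy.utils.signal import send_catch_log

spider = Spider(name="example")  # stand-in spider, for illustration only

# A single exception class still works as before.
send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)

# A sequence of exception classes is now accepted as well.
send_catch_log(signals.spider_idle, spider=spider, dont_log=(DontCloseSpider, CloseSpider))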
Files                          Coverage
scrapy                         88.47%
Project Totals (163 files)     88.47%