scrapy / scrapy

Compare ee2df97 ... +36 ... 7306a81

Coverage Reach
(interactive graph of per-file coverage across the scrapy package: core, utils, extensions, commands, http, downloadermiddlewares, pipelines, settings, spidermiddlewares, spiders and top-level modules)

No flags found



@@ -4,22 +4,20 @@
 from scrapy.spiders import Spider
 from scrapy.utils.defer import deferred_from_coro
 from scrapy.utils.misc import arg_to_iter
-from scrapy.utils.asyncgen import collect_asyncgen


 logger = logging.getLogger(__name__)


 def iterate_spider_output(result):
     if inspect.isasyncgen(result):
-        d = deferred_from_coro(collect_asyncgen(result))
-        d.addCallback(iterate_spider_output)
-        return d
+        return result
     elif inspect.iscoroutine(result):
         d = deferred_from_coro(result)
         d.addCallback(iterate_spider_output)
         return d
-    return arg_to_iter(result)
+    else:
+        return arg_to_iter(deferred_from_coro(result))


 def iter_spider_classes(module):
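
For reference, a small standalone sketch (hypothetical callback names, no Scrapy imports) of how inspect distinguishes the three result shapes the reworked iterate_spider_output() dispatches on: async generators are now passed through unchanged, coroutines are wrapped in a Deferred, and everything else still goes through arg_to_iter().

import inspect

def list_cb():
    return [1, 2, 3]        # plain value/iterable: handled by arg_to_iter()

async def coro_cb():
    return [1, 2, 3]        # coroutine: wrapped in a Deferred via deferred_from_coro()

async def agen_cb():
    yield 1                 # async generator: now returned as-is instead of being collected

coro = coro_cb()
print(inspect.iscoroutine(coro))      # True
coro.close()                          # avoid the "never awaited" warning
print(inspect.isasyncgen(agen_cb()))  # True
print(inspect.isasyncgen(list_cb()))  # False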

@@ -8,6 +8,7 @@

 from scrapy.http import Request
 from scrapy.exceptions import NotConfigured
+from scrapy.utils.asyncgen import _process_iterable_universal

 logger = logging.getLogger(__name__)

@@ -37,4 +38,10 @@
             else:
                 return True

-        return (r for r in result or () if _filter(r))
+        @_process_iterable_universal
+        async def process(result):
+            async for r in result or ():
+                if _filter(r):
+                    yield r
+
+        return process(result)
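
A hedged usage sketch of the pattern this hunk introduces: a coroutine function decorated with _process_iterable_universal filters a plain iterable as a regular generator and an async iterable as an async generator. It assumes this branch is installed (the helper is not part of released Scrapy); keep_even and numbers are made-up names.

import asyncio
from scrapy.utils.asyncgen import _process_iterable_universal  # added in this branch

@_process_iterable_universal
async def keep_even(result):
    async for r in result:
        if r % 2 == 0:
            yield r

# Sync input: a plain generator comes back, no event loop needed.
print(list(keep_even([1, 2, 3, 4])))          # [2, 4]

# Async input: the undecorated coroutine function is called directly.
async def numbers():
    for i in range(5):
        yield i

async def main():
    print([r async for r in keep_even(numbers())])   # [0, 2, 4]

asyncio.run(main())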

@@ -8,11 +8,12 @@
 import sys
 import warnings
 import weakref
-from collections.abc import Iterable
 from functools import partial, wraps
 from itertools import chain
+from typing import AsyncIterable, Iterable, Union

 from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.asyncgen import as_async_generator
 from scrapy.utils.decorators import deprecated

@@ -344,7 +345,7 @@
     def __init__(self, *args: Iterable):
         self.data = chain.from_iterable(args)

-    def extend(self, *iterables: Iterable):
+    def extend(self, *iterables: Iterable) -> None:
         self.data = chain(self.data, chain.from_iterable(iterables))

     def __iter__(self):
@@ -356,3 +357,27 @@
     @deprecated("scrapy.utils.python.MutableChain.__next__")
     def next(self):
         return self.__next__()
+
+
+async def _async_chain(*iterables: Union[Iterable, AsyncIterable]):
+    for it in iterables:
+        async for o in as_async_generator(it):
+            yield o
+
+
+class MutableAsyncChain(AsyncIterable):
+    """
+    Similar to MutableChain but for async iterables
+    """
+
+    def __init__(self, *args: Union[Iterable, AsyncIterable]):
+        self.data = _async_chain(*args)
+
+    def extend(self, *iterables: Union[Iterable, AsyncIterable]) -> None:
+        self.data = _async_chain(self.data, _async_chain(*iterables))
+
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        return await self.data.__anext__()
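
A minimal usage sketch of the new MutableAsyncChain, assuming these additions are importable from your checkout: it chains a plain list and an async generator into a single async iterable, which is how sync and async middleware outputs can be merged.

import asyncio
from scrapy.utils.python import MutableAsyncChain  # added in this diff

async def gen():
    yield 3
    yield 4

async def main():
    chain = MutableAsyncChain([1, 2], gen())  # mixes a sync and an async iterable
    chain.extend([5])                         # extend() also accepts either kind
    print([x async for x in chain])           # [1, 2, 3, 4, 5]

asyncio.run(main())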

@@ -7,6 +7,7 @@
 import logging

 from scrapy.http import Request
+from scrapy.utils.asyncgen import _process_iterable_universal

 logger = logging.getLogger(__name__)

@@ -49,10 +50,15 @@
                                          spider=spider)
             return True

-        # base case (depth=0)
-        if 'depth' not in response.meta:
-            response.meta['depth'] = 0
-            if self.verbose_stats:
-                self.stats.inc_value('request_depth_count/0', spider=spider)
+        @_process_iterable_universal
+        async def process(result):
+            # base case (depth=0)
+            if 'depth' not in response.meta:
+                response.meta['depth'] = 0
+                if self.verbose_stats:
+                    self.stats.inc_value('request_depth_count/0', spider=spider)

-        return (r for r in result or () if _filter(r))
+            async for r in result or ():
+                if _filter(r):
+                    yield r
+        return process(result)
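
One consequence of moving the depth=0 base case inside process() is that it now runs lazily, when the middleware output is first iterated rather than when process_spider_output() returns. A standalone sketch with stand-in names (make_process, meta and stats are hypothetical, not Scrapy APIs):

import asyncio

def make_process(meta, stats):
    # Stand-ins for response.meta and the stats collector in the real middleware.
    async def process(result):
        # The depth=0 base case now runs lazily, on first iteration of the output.
        if 'depth' not in meta:
            meta['depth'] = 0
            stats.append('request_depth_count/0')
        async for r in result:
            yield r
    return process

async def results():
    yield 'item'

async def main():
    meta, stats = {}, []
    out = make_process(meta, stats)(results())
    assert 'depth' not in meta        # creating the generator has no side effects yet
    print([r async for r in out])     # ['item']; the side effects happen here
    assert meta == {'depth': 0} and stats == ['request_depth_count/0']

asyncio.run(main())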

@@ -1,8 +1,63 @@
-from collections.abc import AsyncIterable
+import functools
+import inspect
+from typing import AsyncGenerator, AsyncIterable, Callable, Generator, Iterable, Union


 async def collect_asyncgen(result: AsyncIterable):
     results = []
     async for x in result:
         results.append(x)
     return results
+
+
+async def as_async_generator(it: Union[Iterable, AsyncIterable]) -> AsyncGenerator:
+    """ Wraps an iterable (sync or async) into an async generator. """
+    if isinstance(it, AsyncIterable):
+        async for r in it:
+            yield r
+    else:
+        for r in it:
+            yield r
+
+
+# https://stackoverflow.com/a/66170760/113586
+def _process_iterable_universal(process_async: Callable):
+    """ Takes a function that takes an async iterable, args and kwargs. Returns
+    a function that takes any iterable, args and kwargs.
+
+    Requires that process_async only awaits on the iterable and synchronous functions,
+    so it's better to use this only in the Scrapy code itself.
+    """
+
+    # If this stops working, all internal uses can be just replaced with manually-written
+    # process_sync functions.
+
+    def process_sync(iterable: Iterable, *args, **kwargs) -> Generator:
+        agen = process_async(as_async_generator(iterable), *args, **kwargs)
+        if not inspect.isasyncgen(agen):
+            raise ValueError(f"process_async returned wrong type {type(agen)}")
+        sent = None
+        while True:
+            try:
+                gen = agen.asend(sent)
+                gen.send(None)
+            except StopIteration as e:
+                sent = yield e.value
+            except StopAsyncIteration:
+                return
+            else:
+                gen.throw(RuntimeError,
+                          f"Synchronously-called function '{process_async.__name__}' has blocked, "
+                          f"you can't use {_process_iterable_universal.__name__} with it.")
+
+    @functools.wraps(process_async)
+    def process(iterable: Union[Iterable, AsyncIterable], *args, **kwargs) -> Union[Generator, AsyncGenerator]:
+        if isinstance(iterable, AsyncIterable):
+            # call process_async directly
+            return process_async(iterable, *args, **kwargs)
+        if isinstance(iterable, Iterable):
+            # convert process_async to process_sync
+            return process_sync(iterable, *args, **kwargs)
+        raise TypeError(f"Wrong iterable type {type(iterable)}")
+
+    return process
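
A hedged sketch of the limitation the docstring describes, again assuming this branch is installed: the synchronous driver can only step through awaits that complete immediately, so a decorated function that awaits anything else (here asyncio.sleep) still works on async input but raises the "has blocked" RuntimeError on sync input. delayed_echo and source are made-up names.

import asyncio
from scrapy.utils.asyncgen import _process_iterable_universal  # added in this diff

@_process_iterable_universal
async def delayed_echo(result):
    async for r in result:
        await asyncio.sleep(0)   # awaits something other than the input iterable
        yield r

async def main():
    async def source():
        yield 1
        yield 2
    # Async input is fine: process_async is returned unchanged.
    print([r async for r in delayed_echo(source())])   # [1, 2]

asyncio.run(main())

# Sync input trips the guard in process_sync: the yield from asyncio.sleep()
# cannot be satisfied without an event loop, so the wrapper raises RuntimeError.
try:
    list(delayed_echo([1, 2]))
except RuntimeError as e:
    print(e)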

(Six more file diffs were not expanded in this report.)

Showing 2 files with coverage changes.

Changes in scrapy/core/downloader/handlers/__init__.py:  -5  +1  +4
Changes in scrapy/core/downloader/__init__.py:  -2  +1  +1

38 Commits

Hiding 22 contextual commits
+3 Files  +810  +753  +20  +37
Hiding 1 contextual commit
-2  +1  +1
Hiding 1 contextual commit
+4  +5  -1
Hiding 9 contextual commits
-3 Files  -671  -620  -18  -33
Files Coverage
scrapy  0.09%  88.49%
Project Totals (162 files)  88.49%