scrapy / scrapy
1
"""Helper functions which don't fit anywhere else"""
2 7
import ast
import hashlib
import inspect
import os
import re
import textwrap
import warnings
from collections import deque
from contextlib import contextmanager
from importlib import import_module
from pkgutil import iter_modules

from w3lib.html import replace_entities

from scrapy.item import _BaseItem
from scrapy.utils.datatypes import LocalWeakReferencedCache
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.python import flatten, to_unicode
19

20

21 7
# Types that arg_to_iter() treats as a single value rather than an iterable,
# even though dict/str/bytes technically support iteration.
_ITERABLE_SINGLE_VALUES = dict, _BaseItem, str, bytes
22

23

24 7
def arg_to_iter(arg):
    """Convert an argument to an iterable. The argument can be a None, single
    value, or an iterable.

    Exception: if arg is a dict, [arg] will be returned
    """
    if arg is None:
        return []
    # dict/str/bytes/_BaseItem are iterable but must be treated as one value
    if hasattr(arg, '__iter__') and not isinstance(arg, _ITERABLE_SINGLE_VALUES):
        return arg
    return [arg]
36

37

38 7
def load_object(path):
    """Load an object given its absolute object path, and return it.

    The object can be the import path of a class, function, variable or an
    instance, e.g. 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'.

    If ``path`` is not a string, but is a callable object, such as a class or
    a function, then return it as is.

    :raises TypeError: when ``path`` is neither a string nor a callable
    :raises ValueError: when ``path`` is a string without any dot in it
    :raises NameError: when the module does not contain the named object
    """
    # Non-string arguments are accepted only when they are already callables.
    if not isinstance(path, str):
        if not callable(path):
            raise TypeError("Unexpected argument type, expected string "
                            "or object, got: %s" % type(path))
        return path

    # Split "pkg.mod.name" into the module path and the attribute name.
    module, sep, name = path.rpartition('.')
    if not sep:
        raise ValueError(f"Error loading object '{path}': not a full path")

    mod = import_module(module)
    try:
        return getattr(mod, name)
    except AttributeError:
        raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
69

70

71 7
def walk_modules(path):
    """Loads a module and all its submodules from the given module path and
    returns them. If *any* module throws an exception while importing, that
    exception is thrown back.

    For example: walk_modules('scrapy.utils')
    """
    root = import_module(path)
    modules = [root]
    # Only packages expose __path__; plain modules have no submodules to scan.
    if hasattr(root, '__path__'):
        for _, subname, is_package in iter_modules(root.__path__):
            full_name = path + '.' + subname
            if is_package:
                # Recurse so nested packages contribute all their modules too.
                modules.extend(walk_modules(full_name))
            else:
                modules.append(import_module(full_name))
    return modules
91

92

93 7
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    # Deprecated: kept only as a shim; parsel hosts the maintained version.
    warnings.warn(
        "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
        ScrapyDeprecationWarning,
        stacklevel=2
    )

    if isinstance(regex, str):
        regex = re.compile(regex, re.UNICODE)

    # Prefer the "extract" named group; any failure (no match, no such
    # group) falls back to findall(), which covers the other two policies.
    try:
        strings = [regex.search(text).group('extract')]
    except Exception:
        strings = regex.findall(text)
    strings = flatten(strings)

    keep = ['lt', 'amp']
    if isinstance(text, str):
        return [replace_entities(s, keep=keep) for s in strings]
    return [replace_entities(to_unicode(s, encoding), keep=keep)
            for s in strings]
120

121

122 7
def md5sum(file):
    """Calculate the md5 checksum of a file-like object without reading its
    whole content in memory.

    >>> from io import BytesIO
    >>> md5sum(BytesIO(b'file content to hash'))
    '784406af91dd5a54fbb9c84c2236595a'
    """
    digest = hashlib.md5()
    # Read in fixed-size chunks so arbitrarily large files stay cheap.
    chunk = file.read(8096)
    while chunk:
        digest.update(chunk)
        chunk = file.read(8096)
    return digest.hexdigest()
137

138

139 7
def rel_has_nofollow(rel):
    """Return True if link rel attribute has nofollow type.

    ``rel`` is treated as a token list: tokens are separated by whitespace
    per the HTML spec, but commas are also tolerated (e.g. "ugc,nofollow"),
    which occur frequently in real-world markup and were previously missed
    by a plain whitespace split.
    """
    # Normalize comma separators to spaces before tokenizing.
    return rel is not None and 'nofollow' in rel.replace(',', ' ').split()
142

143

144 7
def create_instance(objcls, settings, crawler, *args, **kwargs):
    """Construct a class instance using its ``from_crawler`` or
    ``from_settings`` constructors, if available.

    At least one of ``settings`` and ``crawler`` needs to be different from
    ``None``. If ``settings `` is ``None``, ``crawler.settings`` will be used.
    If ``crawler`` is ``None``, only the ``from_settings`` constructor will be
    tried.

    ``*args`` and ``**kwargs`` are forwarded to the constructors.

    Raises ``ValueError`` if both ``settings`` and ``crawler`` are ``None``.

    .. versionchanged:: 2.2
       Raises ``TypeError`` if the resulting instance is ``None`` (e.g. if an
       extension has not been implemented correctly).
    """
    if settings is None and crawler is None:
        raise ValueError("Specify at least one of settings and crawler.")
    if settings is None:
        settings = crawler.settings
    # Prefer the richest constructor available: from_crawler, then
    # from_settings, then the plain class call.
    if crawler and hasattr(objcls, 'from_crawler'):
        method_name = 'from_crawler'
        obj = objcls.from_crawler(crawler, *args, **kwargs)
    elif hasattr(objcls, 'from_settings'):
        method_name = 'from_settings'
        obj = objcls.from_settings(settings, *args, **kwargs)
    else:
        method_name = '__new__'
        obj = objcls(*args, **kwargs)
    if obj is None:
        raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
    return obj
177

178

179 7
@contextmanager
def set_environ(**kwargs):
    """Temporarily set environment variables inside the context manager and
    fully restore previous environment afterwards
    """
    # Snapshot the prior value (None means "was unset") for each key we touch.
    saved = {key: os.environ.get(key) for key in kwargs}
    os.environ.update(kwargs)
    try:
        yield
    finally:
        for key, previous in saved.items():
            if previous is None:
                os.environ.pop(key)
            else:
                os.environ[key] = previous
195

196

197 7
def walk_callable(node):
    """Similar to ``ast.walk``, but walks only function body and skips nested
    functions defined within the node.
    """
    pending = deque((node,))
    seen_function_def = False
    while pending:
        current = pending.popleft()
        if isinstance(current, ast.FunctionDef):
            if seen_function_def:
                # Nested def: skip it entirely (neither yielded nor descended).
                continue
            seen_function_def = True
        pending.extend(ast.iter_child_nodes(current))
        yield current
211

212

213 7
# Bounded cache (128 entries) of results computed by
# is_generator_with_return_value(), keyed by the inspected callable.
_generator_callbacks_cache = LocalWeakReferencedCache(limit=128)
214

215

216 7
def is_generator_with_return_value(callable):
    """
    Returns True if a callable is a generator function which includes a
    'return' statement with a value different than None, False otherwise
    """
    if callable in _generator_callbacks_cache:
        return _generator_callbacks_cache[callable]

    def returns_none(return_node):
        # A bare ``return`` has value None; ``return None`` parses as a
        # NameConstant whose value is None.  (NameConstant, not Constant,
        # so the check also works on pre-3.8 ASTs.)
        value = return_node.value
        return value is None or isinstance(value, ast.NameConstant) and value.value is None

    if inspect.isgeneratorfunction(callable):
        # inspect.getsource() keeps the indentation of methods and nested
        # functions; textwrap.dedent strips that common indentation from
        # every line so ast.parse() accepts it.  The previous regex
        # (no re.MULTILINE) only stripped the first line, making ast.parse
        # raise IndentationError for any indented callback.
        code = textwrap.dedent(inspect.getsource(callable))
        tree = ast.parse(code)
        for node in walk_callable(tree):
            if isinstance(node, ast.Return) and not returns_none(node):
                _generator_callbacks_cache[callable] = True
                return _generator_callbacks_cache[callable]

    _generator_callbacks_cache[callable] = False
    return _generator_callbacks_cache[callable]
238

239

240 7
def warn_on_generator_with_return_value(spider, callable):
    """
    Logs a warning if a callable is a generator function and includes
    a 'return' statement with a value different than None
    """
    # The source inspection can fail with IndentationError on some
    # callables; fall back to a "could not determine" warning in that case.
    try:
        generator_returns_value = is_generator_with_return_value(callable)
    except IndentationError:
        callable_name = spider.__class__.__name__ + "." + callable.__name__
        warnings.warn(
            f'Unable to determine whether or not "{callable_name}" is a generator with a return value. '
            'This will not prevent your code from working, but it prevents Scrapy from detecting '
            f'potential issues in your implementation of "{callable_name}". Please, report this in the '
            'Scrapy issue tracker (https://github.com/scrapy/scrapy/issues), '
            f'including the code of "{callable_name}"',
            stacklevel=2,
        )
        return
    if generator_returns_value:
        warnings.warn(
            f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
            'a generator and includes a "return" statement with a value '
            'different than None. This could lead to unexpected behaviour. Please see '
            'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
            'for details about the semantics of the "return" statement within generators',
            stacklevel=2,
        )

Read our documentation on viewing source code .

Loading