scrapy / scrapy
1
"""
2
This module contains essential stuff that should've come with Python itself ;)
3
"""
4 7
import errno
5 7
import gc
6 7
import inspect
7 7
import re
8 7
import sys
9 7
import warnings
10 7
import weakref
11 7
from functools import partial, wraps
12 7
from itertools import chain
13

14 7
from scrapy.exceptions import ScrapyDeprecationWarning
15 7
from scrapy.utils.decorators import deprecated
16

17

18 7
def flatten(x):
    """flatten(sequence) -> list

    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables).

    The work is delegated to ``iflatten()``; this function only collects
    the items it yields into a list.

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    return list(iflatten(x))
36

37

38 7
def iflatten(x):
    """iflatten(sequence) -> iterator

    Similar to ``flatten()``, but lazily yields the items instead of
    building a list.

    Every element for which ``is_listlike()`` is true is descended into
    recursively; every other element is yielded unchanged.
    """
    for el in x:
        if is_listlike(el):
            # Delegate to the nested generator instead of re-yielding by hand.
            yield from iflatten(el)
        else:
            yield el
48

49

50 7
def is_listlike(x):
    """Return ``True`` if ``x`` is iterable but is neither ``str`` nor ``bytes``.

    An object counts as iterable here when it exposes ``__iter__``.

    >>> is_listlike("foo")
    False
    >>> is_listlike(5)
    False
    >>> is_listlike(b"foo")
    False
    >>> is_listlike([b"foo"])
    True
    >>> is_listlike((b"foo",))
    True
    >>> is_listlike({})
    True
    >>> is_listlike(set())
    True
    >>> is_listlike((x for x in range(3)))
    True
    >>> is_listlike(range(5))
    True
    """
    return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
72

73

74 7
def unique(list_, key=lambda x: x):
    """Return the items of ``list_`` with duplicates removed, keeping order.

    Two items are duplicates when ``key(item)`` returns equal values; the
    first item seen for each key is the one that is kept. The values
    returned by ``key`` are stored in a set, so they must be hashable.

    :param list_: iterable of items to de-duplicate
    :param key: callable mapping an item to its identity (defaults to the
        item itself)
    :return: a new list with the first occurrence of every distinct key
    """
    seen = set()
    result = []
    for item in list_:
        seenkey = key(item)
        if seenkey not in seen:
            seen.add(seenkey)
            result.append(item)
    return result
85

86

87 7
def to_unicode(text, encoding=None, errors='strict'):
    """Return the unicode representation of a bytes object ``text``. If
    ``text`` is already an unicode object, return it as-is.

    :param text: ``bytes`` to decode, or ``str`` to pass through
    :param encoding: codec name; ``None`` means ``'utf-8'``
    :param errors: error handler passed to ``bytes.decode``
    :raises TypeError: if ``text`` is neither ``bytes`` nor ``str``
    """
    if isinstance(text, str):
        return text
    # str has already been handled above, so only bytes is acceptable here.
    if not isinstance(text, bytes):
        raise TypeError('to_unicode must receive a bytes or str '
                        f'object, got {type(text).__name__}')
    if encoding is None:
        encoding = 'utf-8'
    return text.decode(encoding, errors)
98

99

100 7
def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of ``text``. If ``text``
    is already a bytes object, return it as-is.

    :param text: ``str`` to encode, or ``bytes`` to pass through
    :param encoding: codec name; ``None`` means ``'utf-8'``
    :param errors: error handler passed to ``str.encode``
    :raises TypeError: if ``text`` is neither ``str`` nor ``bytes``
    """
    if isinstance(text, bytes):
        return text
    if not isinstance(text, str):
        raise TypeError('to_bytes must receive a str or bytes '
                        f'object, got {type(text).__name__}')
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)
111

112

113 7
@deprecated('to_unicode')
def to_native_str(text, encoding=None, errors='strict'):
    """Return str representation of ``text``.

    Deprecated alias that forwards all of its arguments to ``to_unicode()``.
    """
    return to_unicode(text, encoding, errors)
117

118

119 7
def re_rsearch(pattern, text, chunk_size=1024):
    """
    This function does a reverse search in a text using a regular expression
    given in the attribute 'pattern'.
    Since the re module does not provide this functionality, we have to find for
    the expression into chunks of text extracted from the end (for the sake of efficiency).
    At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and searched for
    the pattern. If the pattern is not found, another chunk is extracted, and another
    search is performed.
    This process continues until a match is found, or until the whole file is read.
    In case the pattern wasn't found, None is returned, otherwise it returns a tuple containing
    the start position of the match, and the ending (regarding the entire text).

    Every chunk is a suffix of ``text`` (``text[offset:]``), growing by
    ``chunk_size * 1024`` items per step; the final chunk is ``text`` itself.
    The last match found inside the first chunk that has any match wins.

    :param pattern: a compiled pattern, or a ``str`` that gets compiled
    :param text: the text to search
    :param chunk_size: chunk growth step, in units of 1024 items; a
        non-positive value makes the function search ``text`` in one pass
    """
    step = chunk_size * 1024

    def _chunk_iter():
        offset = len(text)
        # With a non-positive step the offset would never approach 0 and the
        # loop below would spin forever, so skip straight to the full text.
        if step > 0:
            while True:
                offset -= step
                if offset <= 0:
                    break
                yield (text[offset:], offset)
        yield (text, 0)

    if isinstance(pattern, str):
        pattern = re.compile(pattern)

    for chunk, offset in _chunk_iter():
        # Only the last match matters; walk the iterator without storing
        # every match object.
        last = None
        for last in pattern.finditer(chunk):
            pass
        if last is not None:
            start, end = last.span()
            return offset + start, offset + end
    return None
151

152

153 7
def memoizemethod_noargs(method):
    """Decorator to cache the result of a method (without arguments) using a
    weak reference to its object

    The cache is keyed on the instance only: any arguments passed to the
    wrapped method are forwarded on the first call but do not take part in
    the cache lookup. Entries live in a ``WeakKeyDictionary``, so the cache
    itself does not keep instances alive.
    """
    cache = weakref.WeakKeyDictionary()

    @wraps(method)
    def new_method(self, *args, **kwargs):
        if self not in cache:
            cache[self] = method(self, *args, **kwargs)
        return cache[self]

    return new_method
166

167

168 7
# Control characters (code points 0-31) except NUL, tab, line feed and
# carriage return, each stored as a one-byte ``bytes`` object.
_BINARYCHARS = {bytes((i,)) for i in range(32)} - {b"\0", b"\t", b"\n", b"\r"}
# Iterating over a ``bytes`` object yields ints, so the same characters are
# also stored as their integer code points.
_BINARYCHARS |= {ord(ch) for ch in _BINARYCHARS}
170

171

172 7
def binary_is_text(data):
    """ Returns ``True`` if the given ``data`` argument (a ``bytes`` object)
    does not contain unprintable control characters.

    The characters regarded as unprintable are the members of the
    module-level ``_BINARYCHARS`` set.

    :raises TypeError: if ``data`` is not a ``bytes`` object
    """
    if not isinstance(data, bytes):
        raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
    # True exactly when no byte of ``data`` is a member of the set.
    return _BINARYCHARS.isdisjoint(data)
179

180

181 7
def _getargspec_py23(func):
182
    """_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
183
                                                        defaults)
184

185
    Was identical to inspect.getargspec() in python2, but uses
186
    inspect.getfullargspec() for python3 behind the scenes to avoid
187
    DeprecationWarning.
188

189
    >>> def f(a, b=2, *ar, **kw):
190
    ...     pass
191

192
    >>> _getargspec_py23(f)
193
    ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
194
    """
195 7
    return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])
196

197

198 7
def get_func_args(func, stripself=False):
    """Return the argument name list of a callable

    Positional and keyword-only parameter names are returned, in that order;
    ``*args`` / ``**kwargs`` names are not included.

    * plain functions are inspected directly;
    * classes are inspected through ``__init__`` and bound methods through
      ``__func__``, in both cases without the first positional parameter;
    * ``functools.partial`` objects report the wrapped callable's names minus
      the ones already bound positionally or by keyword;
    * other callable objects are inspected through ``__call__``;
    * method descriptors and other routines that cannot be introspected
      yield an empty list.

    :param func: the callable to inspect
    :param stripself: drop the first positional parameter name, if any
    :raises TypeError: if ``func`` is not callable
    """
    if inspect.isfunction(func):
        spec = inspect.getfullargspec(func)
        # Strip only a *positional* name: a function declared as
        # ``def f(*args, key=None)`` has none, and its keyword-only names
        # must be left untouched.
        positional = spec.args[1:] if stripself else spec.args
        return positional + spec.kwonlyargs
    if inspect.isclass(func):
        return get_func_args(func.__init__, True)
    if inspect.ismethod(func):
        return get_func_args(func.__func__, True)
    if inspect.ismethoddescriptor(func):
        return []
    if isinstance(func, partial):
        return [x for x in get_func_args(func.func)[len(func.args):]
                if not (func.keywords and x in func.keywords)]
    if hasattr(func, '__call__'):
        if inspect.isroutine(func):
            return []
        if getattr(func, '__name__', None) == '__call__':
            return []
        return get_func_args(func.__call__, True)
    raise TypeError(f'{type(func)} is not callable')
224

225

226 7
def get_spec(func):
    """Returns (args, kwargs) tuple for a function

    ``args`` is the list of positional parameter names without a default and
    ``kwargs`` maps each positional parameter that has a default to that
    default. Functions and methods are inspected directly; any other object
    with a ``__call__`` attribute is inspected through it.

    >>> import re
    >>> get_spec(re.match)
    (['pattern', 'string'], {'flags': 0})

    >>> class Test:
    ...     def __call__(self, val):
    ...         pass
    ...     def method(self, val, flags=0):
    ...         pass

    >>> get_spec(Test)
    (['self', 'val'], {})

    >>> get_spec(Test.method)
    (['self', 'val'], {'flags': 0})

    >>> get_spec(Test().method)
    (['self', 'val'], {'flags': 0})

    :raises TypeError: if ``func`` is not callable
    """
    if inspect.isfunction(func) or inspect.ismethod(func):
        target = func
    elif hasattr(func, '__call__'):
        target = func.__call__
    else:
        raise TypeError(f'{type(func)} is not callable')

    # Read the full spec directly: only ``args`` and ``defaults`` are needed,
    # and this avoids ``inspect.ArgSpec``, which newer Pythons do not provide.
    spec = inspect.getfullargspec(target)
    defaults = spec.defaults or ()

    # Defaults always belong to the trailing positional parameters.
    firstdefault = len(spec.args) - len(defaults)
    args = spec.args[:firstdefault]
    kwargs = dict(zip(spec.args[firstdefault:], defaults))
    return args, kwargs
261

262

263 7
def equal_attributes(obj1, obj2, attributes):
    """Compare two objects attributes

    :param obj1: first object
    :param obj2: second object
    :param attributes: iterable of attribute names and/or callables; a
        callable is applied to each object and the two results are compared
    :return: ``True`` only if every listed attribute compares equal on both
        objects; ``False`` if any differs or if ``attributes`` is empty
    """
    # not attributes given return False by default
    if not attributes:
        return False

    # Two distinct sentinels: an attribute missing on both objects compares
    # as different rather than equal.
    temp1, temp2 = object(), object()
    for attr in attributes:
        # support callables like itemgetter
        if callable(attr):
            if attr(obj1) != attr(obj2):
                return False
        elif getattr(obj1, attr, temp1) != getattr(obj2, attr, temp2):
            return False
    # all attributes equal
    return True
279

280

281 7
class WeakKeyCache:
    """Deprecated cache that builds missing values with ``default_factory``.

    Values are stored in a ``weakref.WeakKeyDictionary``. Instantiating the
    class emits a ``ScrapyDeprecationWarning``.
    """

    def __init__(self, default_factory):
        """Store ``default_factory``, the callable used to build missing values."""
        warnings.warn("The WeakKeyCache class is deprecated", category=ScrapyDeprecationWarning, stacklevel=2)
        self.default_factory = default_factory
        self._weakdict = weakref.WeakKeyDictionary()

    def __getitem__(self, key):
        """Return the value cached for ``key``, calling ``default_factory(key)``
        to create and store it on first access."""
        if key not in self._weakdict:
            self._weakdict[key] = self.default_factory(key)
        return self._weakdict[key]
292

293

294 7
@deprecated
def retry_on_eintr(function, *args, **kw):
    """Run a function and retry it while getting EINTR errors

    :param function: callable to invoke with ``*args`` and ``**kw``
    :return: whatever ``function`` returns on its first successful call
    :raises OSError: re-raised unchanged when its ``errno`` is not ``EINTR``
    """
    while True:
        try:
            return function(*args, **kw)
        except OSError as e:  # ``IOError`` is an alias of ``OSError``
            if e.errno != errno.EINTR:
                raise
303

304

305 7
def without_none_values(iterable):
    """Return a copy of ``iterable`` with all ``None`` entries removed.

    If ``iterable`` is a mapping, return a dictionary where all pairs that have
    value ``None`` have been removed.

    An object is treated as a mapping when it has an ``items`` attribute;
    otherwise the result is built by calling ``type(iterable)`` on the
    filtered items.
    """
    try:
        items = iterable.items
    except AttributeError:
        # Not a mapping: rebuild the same container type without the Nones.
        return type(iterable)(v for v in iterable if v is not None)
    # Kept outside the ``try`` so that an AttributeError raised while
    # iterating the mapping is not mistaken for "this is not a mapping".
    return {k: v for k, v in items() if v is not None}
315

316

317 7
def global_object_name(obj):
    """
    Return full name of a global object.

    The name is built as ``obj.__module__`` and ``obj.__name__`` joined by
    a dot.

    >>> from scrapy import Request
    >>> global_object_name(Request)
    'scrapy.http.request.Request'
    """
    return f"{obj.__module__}.{obj.__name__}"
326

327

328 7
# ``garbage_collect()`` is defined per interpreter: PyPy is detected through
# the ``sys.pypy_version_info`` attribute.
if hasattr(sys, "pypy_version_info"):
    def garbage_collect():
        """Force a garbage collection (two passes on PyPy)."""
        # Collecting weakreferences can take two collections on PyPy.
        gc.collect()
        gc.collect()
else:
    def garbage_collect():
        """Force a garbage collection."""
        gc.collect()
336

337

338 7
class MutableChain:
    """
    Thin wrapper around itertools.chain, allowing to add iterables "in-place"
    """

    def __init__(self, *args):
        """Chain the given iterables, in order, into a single stream."""
        self.data = chain.from_iterable(args)

    def extend(self, *iterables):
        """Append ``iterables`` after whatever is still pending in the chain."""
        self.data = chain(self.data, chain.from_iterable(iterables))

    def __iter__(self):
        # The object is its own iterator.
        return self

    def __next__(self):
        """Return the next item of the underlying chain."""
        return next(self.data)

    @deprecated("scrapy.utils.python.MutableChain.__next__")
    def next(self):
        """Deprecated alias of ``__next__()``."""
        return self.__next__()

Read our documentation on viewing source code.

Loading