Use https link
1 |
import re |
|
2 |
import six |
|
3 |
from w3lib.html import replace_entities as w3lib_replace_entities |
|
4 |
|
|
5 |
|
|
6 |
def flatten(x):
    """flatten(sequence) -> list

    Build one flat list holding every element of ``x`` together with the
    elements of all sub-sequences (iterables) nested inside it, at any
    depth. Text and byte strings are kept as single elements and are not
    split into characters.

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    # Drain the generator-based implementation into a concrete list.
    flattened = []
    flattened.extend(iflatten(x))
    return flattened
|
22 |
|
|
23 |
|
|
24 |
def iflatten(x):
    """iflatten(sequence) -> Iterator

    Similar to ``flatten()``, but yields the elements one by one instead
    of returning a list.

    Elements for which ``_is_listlike()`` is true are descended into
    recursively; every other element (including text and byte strings)
    is yielded unchanged, in the order it is encountered.
    """
    for el in x:
        if _is_listlike(el):
            # Recurse through the generator itself rather than through
            # ``flatten()``: the latter builds a complete list for every
            # nested sub-sequence before the first of its items can be
            # yielded, which defeats the purpose of returning an iterator.
            # NOTE(review): an explicit loop is used instead of
            # ``yield from`` because the file imports ``six``, so Python 2
            # is presumably still supported -- confirm before modernizing.
            for el_ in iflatten(el):
                yield el_
        else:
            yield el
|
33 |
|
|
34 |
|
|
35 |
def _is_listlike(x):
    """Tell whether ``x`` is an iterable container rather than a string.

    Returns ``True`` for any object exposing ``__iter__``, except text
    strings (``six.text_type``) and ``bytes``, which always give ``False``.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    # Strings are iterable too, but callers want them treated as scalars.
    if isinstance(x, (six.text_type, bytes)):
        return False
    return hasattr(x, "__iter__")
|
57 |
|
|
58 |
|
|
59 |
def extract_regex(regex, text, replace_entities=True):
    """Extract a flat list of strings from ``text`` using ``regex``.

    ``regex`` may be a compiled pattern or a pattern string; a string is
    compiled with ``re.UNICODE``. The following policies apply:

    * if the regex contains a named group called "extract", the value of
      that group in the first match is returned (an empty list when there
      is no match or the group did not participate in it)
    * if the regex contains multiple numbered groups, all of them are
      returned (flattened)
    * if the regex doesn't contain any group, the entire regex matching
      is returned

    Unless ``replace_entities`` is false, every extracted string is passed
    through ``w3lib.html.replace_entities`` with ``keep=['lt', 'amp']``.
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    if 'extract' not in regex.groupindex:
        # full regex or numbered groups
        strings = regex.findall(text)
    else:
        # named group: only the first match is considered
        match = regex.search(text)
        extracted = match.group('extract') if match is not None else None
        strings = [] if extracted is None else [extracted]

    # findall() yields tuples when the pattern has several groups.
    strings = flatten(strings)
    if replace_entities:
        return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
    return strings
|
84 |
|
|
85 |
|
|
86 |
def shorten(text, width, suffix='...'):
    """Truncate the given text to fit in the given width.

    Text that already fits is returned unchanged. Otherwise the text is
    cut so that, with ``suffix`` appended, the result is exactly ``width``
    characters long. When ``width`` is too small to hold the whole suffix,
    only the last ``width`` characters of the suffix are returned.

    Raises ``ValueError`` if ``width`` is negative.
    """
    text_len = len(text)
    if text_len <= width:
        return text

    suffix_len = len(suffix)
    if width > suffix_len:
        # Room for at least one character of text before the suffix.
        kept = width - suffix_len
        return text[:kept] + suffix

    if width < 0:
        raise ValueError('width must be equal or greater than 0')

    # No room for any text: return the tail of the suffix that fits.
    return suffix[suffix_len - width:]
Read our documentation on viewing source code .