1
import re
2
import six
3
from w3lib.html import replace_entities as w3lib_replace_entities
4

5

6
def flatten(x):
    """flatten(sequence) -> list

    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables). Text and byte strings are treated as atoms, not as
    sequences of characters.

    Examples:

    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    # The actual traversal lives in the generator; this only materializes it.
    return list(iflatten(x))
22

23

24
def iflatten(x):
    """iflatten(sequence) -> Iterator

    Similar to ``flatten()``, but returns an iterator instead of a list.
    Elements are yielded in depth-first order; anything ``_is_listlike``
    accepts is descended into, everything else is yielded unchanged.
    """
    for el in x:
        if _is_listlike(el):
            # Recurse through the generator itself (not ``flatten``) so that
            # nested sequences are streamed instead of being copied into a
            # temporary list at every nesting level.
            for el_ in iflatten(el):
                yield el_
        else:
            yield el
33

34

35
def _is_listlike(x):
    """Return True if ``x`` is iterable but is not a text or byte string.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    # Strings expose ``__iter__`` too (on Python 3), so they have to be
    # excluded explicitly to be treated as single values.
    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
57

58

59
def extract_regex(regex, text, replace_entities=True):
    """Extract a list of strings from the given text using the following policies:

    * if the regex contains a named group called "extract", the value of that
      group for the first match is returned (as a one-element list)
    * if the regex contains multiple numbered groups, all those will be
      returned (flattened)
    * if the regex doesn't contain any group, the entire regex matching is
      returned

    ``regex`` may be a pattern string (compiled here with ``re.UNICODE``) or
    an already compiled pattern object.

    When ``replace_entities`` is true (the default), every extracted string is
    passed through ``w3lib.html.replace_entities`` with ``keep=['lt', 'amp']``
    (presumably leaving ``&lt;`` and ``&amp;`` unconverted -- see the w3lib
    documentation); otherwise the raw matches are returned.

    An empty list is returned when nothing matches.
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    if 'extract' in regex.groupindex:
        # named group: only the first match is considered
        match = regex.search(text)
        extracted = match.group('extract') if match is not None else None
        # The group can be None even on a match if it did not participate.
        strings = [] if extracted is None else [extracted]
    else:
        # full regex or numbered groups; findall() yields tuples when there
        # are several groups, which flatten() below unpacks
        strings = regex.findall(text)

    strings = flatten(strings)
    if not replace_entities:
        return strings
    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
84

85

86
def shorten(text, width, suffix='...'):
    """Truncate the given text to fit in the given width.

    * ``text`` is returned unchanged when it already fits in ``width``.
    * Otherwise the text is cut and ``suffix`` appended so that the result
      is exactly ``width`` characters long.
    * When ``width`` is too small to hold the whole suffix, the last
      ``width`` characters of the suffix are returned instead.

    Raises ``ValueError`` if ``width`` is negative.
    """
    # A negative width can never be satisfied (len(text) >= 0 > width), so
    # reject it up front instead of falling through every other branch.
    if width < 0:
        raise ValueError('width must be equal or greater than 0')
    if len(text) <= width:
        return text
    if width > len(suffix):
        return text[:width-len(suffix)] + suffix
    # Not enough room for any text: keep only the tail of the suffix.
    return suffix[len(suffix)-width:]

Read our documentation on viewing source code.

Loading