1
"""
XPath selectors based on lxml
"""
4

5
import sys
6

7
import six
8
from lxml import etree, html
9

10
from .utils import flatten, iflatten, extract_regex, shorten
11
from .csstranslator import HTMLTranslator, GenericTranslator
12

13

14
class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``Selector.remove()`` when the selected root has no
    ``getparent`` attribute (for example a plain string result)."""
16

17

18
class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``Selector.remove()`` when the selected node has no parent
    to be removed from (``getparent()`` returned ``None``)."""
20

21

22
class SafeXMLParser(etree.XMLParser):
    """``etree.XMLParser`` whose ``resolve_entities`` option defaults to
    ``False`` instead of the parser's own default."""

    def __init__(self, *args, **kwargs):
        # Only supply the default: an explicit ``resolve_entities`` keyword
        # from the caller is forwarded untouched.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
26

27

28
# Per-type settings looked up by ``Selector.__init__``: the parser class used
# to build the tree, the CSS-to-XPath translator instance, and the ``method``
# handed to ``etree.tostring`` when serializing.
_ctgroup = {
    'html': dict(
        _parser=html.HTMLParser,
        _csstranslator=HTMLTranslator(),
        _tostring_method='html',
    ),
    'xml': dict(
        _parser=SafeXMLParser,
        _csstranslator=GenericTranslator(),
        _tostring_method='xml',
    ),
}
36

37

38
def _st(st):
39
    if st is None:
40
        return 'html'
41
    elif st in _ctgroup:
42
        return st
43
    else:
44
        raise ValueError('Invalid type: %s' % st)
45

46

47
def create_root_node(text, parser_cls, base_url=None):
    """Parse ``text`` with a fresh ``parser_cls`` instance and return the root.

    ``text`` is stripped, has NUL characters removed and is encoded as UTF-8
    before parsing. If nothing is left, or if parsing yields ``None``, the
    root of an empty ``<html/>`` document is returned instead.
    """
    fallback = b'<html/>'

    cleaned = text.strip().replace('\x00', '')
    payload = cleaned.encode('utf8')
    if not payload:
        payload = fallback

    parser = parser_cls(recover=True, encoding='utf8')
    node = etree.fromstring(payload, parser=parser, base_url=base_url)
    if node is not None:
        return node
    # The parser produced no root: parse the placeholder with the same parser.
    return etree.fromstring(fallback, parser=parser, base_url=base_url)
56

57

58
class SelectorList(list):
    """
    A subclass of the builtin ``list`` whose extra methods forward a call to
    the elements it holds and gather the results.
    """

    # __getslice__ is deprecated; the builtin ``list`` implements it only on
    # Python 2, which is the only interpreter that ends up calling this.
    def __getslice__(self, i, j):
        return self.__class__(super(SelectorList, self).__getslice__(i, j))

    def __getitem__(self, pos):
        item = super(SelectorList, self).__getitem__(pos)
        if isinstance(pos, slice):
            # Re-wrap slices so they keep the SelectorList type.
            return self.__class__(item)
        return item

    def __getstate__(self):
        raise TypeError("can't pickle SelectorList objects")

    def xpath(self, xpath, namespaces=None, **kwargs):
        """
        Call ``.xpath()`` on every element of this list and return the
        flattened results as a new list of this same class.

        ``xpath`` is the query; it is passed as the first argument of each
        element's ``.xpath()`` call.

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        forwarded unchanged to each element's ``.xpath()`` call.

        Any additional named arguments are forwarded as well; they can be
        used to pass values for XPath variables in the expression, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")
        """
        per_element = [
            sel.xpath(xpath, namespaces=namespaces, **kwargs) for sel in self
        ]
        return self.__class__(flatten(per_element))

    def css(self, query):
        """
        Call ``.css()`` on every element of this list and return the
        flattened results as a new list of this same class.

        ``query`` is passed unchanged to each element's ``.css()`` call.
        """
        per_element = [sel.css(query) for sel in self]
        return self.__class__(flatten(per_element))

    def re(self, regex, replace_entities=True):
        """
        Call ``.re()`` on every element of this list and return the
        flattened results.

        ``replace_entities`` (``True`` by default) is forwarded to each
        element's ``.re()`` call; pass ``False`` to switch entity replacement
        off.
        """
        per_element = [
            sel.re(regex, replace_entities=replace_entities) for sel in self
        ]
        return flatten(per_element)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Return the first item produced by calling ``.re()`` on the elements
        of this list in order. If the list is empty or nothing matches,
        return ``default`` (``None`` if the argument is not provided).

        ``replace_entities`` (``True`` by default) is forwarded to each
        element's ``.re()`` call; pass ``False`` to switch entity replacement
        off.
        """
        # The generator keeps evaluation lazy: elements are only queried
        # until the first match is produced.
        matches = iflatten(
            sel.re(regex, replace_entities=replace_entities) for sel in self
        )
        return next(iter(matches), default)

    def getall(self):
        """
        Call ``.get()`` on every element of this list and return the results
        as a list, in order.
        """
        return [sel.get() for sel in self]
    extract = getall

    def get(self, default=None):
        """
        Return the result of ``.get()`` for the first element of this list.
        If the list is empty, return ``default``.
        """
        try:
            head = next(iter(self))
        except StopIteration:
            return default
        return head.get()
    extract_first = get

    @property
    def attrib(self):
        """Return the ``attrib`` of the first element of this list.
        If the list is empty, return an empty dict.
        """
        try:
            head = next(iter(self))
        except StopIteration:
            return {}
        return head.attrib

    def remove(self):
        """
        Call ``.remove()`` on every element of this list.
        """
        for sel in self:
            sel.remove()
165

166

167
class Selector(object):
    """
    :class:`Selector` wraps a parsed XML or HTML node and lets you query it
    with CSS or XPath expressions and serialize the result.

    ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in
    Python 3. When given, it is parsed to build the root node.

    ``type`` selects the parsing/serializing flavour: ``"html"``, ``"xml"``
    or ``None`` (default). ``None`` falls back to ``_default_type`` and then
    to ``"html"``.

    ``namespaces`` is an optional ``prefix: namespace-uri`` mapping added on
    top of the default namespaces.

    ``root`` is an already-built node to wrap; it is used when ``text`` is
    not given.

    ``base_url`` is passed to ``lxml.etree.fromstring`` when parsing
    ``text``; see the `lxml documentation <https://lxml.de/api/index.html>`_
    for its meaning.
    """

    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
                 '__weakref__', '_parser', '_csstranslator', '_tostring_method']

    _default_type = None
    _default_namespaces = {
        "re": "http://exslt.org/regular-expressions",

        # supported in libxslt:
        # set:difference
        # set:has-same-node
        # set:intersection
        # set:leading
        # set:trailing
        "set": "http://exslt.org/sets"
    }
    _lxml_smart_strings = False
    selectorlist_cls = SelectorList

    def __init__(self, text=None, type=None, namespaces=None, root=None,
                 base_url=None, _expr=None):
        kind = _st(type or self._default_type)
        self.type = kind
        settings = _ctgroup[kind]
        self._parser = settings['_parser']
        self._csstranslator = settings['_csstranslator']
        self._tostring_method = settings['_tostring_method']

        if text is None:
            if root is None:
                raise ValueError("Selector needs either text or root argument")
        else:
            if not isinstance(text, six.text_type):
                msg = "text argument should be of type %s, got %s" % (
                    six.text_type, text.__class__)
                raise TypeError(msg)
            # ``text`` takes precedence over any ``root`` that was passed.
            root = self._get_root(text, base_url)

        merged = dict(self._default_namespaces)
        if namespaces is not None:
            merged.update(namespaces)
        self.namespaces = merged
        self.root = root
        self._expr = _expr

    def __getstate__(self):
        raise TypeError("can't pickle Selector objects")

    def _get_root(self, text, base_url=None):
        """Build the root node for ``text`` with this selector's parser."""
        return create_root_node(text, self._parser, base_url=base_url)

    def xpath(self, query, namespaces=None, **kwargs):
        """
        Evaluate the XPath ``query`` against the root and return the results
        as a ``selectorlist_cls`` instance whose items are selectors of this
        same class.

        ``query`` is a string containing the XPath query to apply.

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        used in addition to this selector's namespaces for this call only;
        it is not stored on the selector.

        Any additional named arguments are passed on to the evaluation and
        can be used to supply values for XPath variables, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")

        Raises ``ValueError`` when lxml reports an ``XPathError``.
        """
        try:
            evaluate = self.root.xpath
        except AttributeError:
            # The root cannot be queried (it has no ``xpath`` attribute).
            return self.selectorlist_cls([])

        scope = dict(self.namespaces)
        if namespaces is not None:
            scope.update(namespaces)

        try:
            found = evaluate(query, namespaces=scope,
                             smart_strings=self._lxml_smart_strings,
                             **kwargs)
        except etree.XPathError as exc:
            msg = u"XPath error: %s in %s" % (exc, query)
            msg = msg if six.PY3 else msg.encode('unicode_escape')
            six.reraise(ValueError, ValueError(msg), sys.exc_info()[2])

        # Anything that is not exactly a list is wrapped so it can be handled
        # uniformly below.
        if type(found) is not list:
            found = [found]

        make = self.__class__
        wrapped = [
            make(root=node, _expr=query, namespaces=self.namespaces,
                 type=self.type)
            for node in found
        ]
        return self.selectorlist_cls(wrapped)

    def css(self, query):
        """
        Apply the given CSS selector and return the matching selectors.

        ``query`` is a string containing the CSS selector to apply. It is
        translated to XPath by this selector's CSS translator and the result
        is run through ``.xpath()``.
        """
        translated = self._css2xpath(query)
        return self.xpath(translated)

    def _css2xpath(self, query):
        """Translate the CSS ``query`` into an XPath expression."""
        return self._csstranslator.css_to_xpath(query)

    def re(self, regex, replace_entities=True):
        """
        Apply ``regex`` to the serialized content (``self.get()``) via
        ``extract_regex`` and return its result.

        ``replace_entities`` (``True`` by default) is forwarded to
        ``extract_regex``; pass ``False`` to switch entity replacement off.
        """
        serialized = self.get()
        return extract_regex(regex, serialized,
                             replace_entities=replace_entities)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Return the first item of ``.re(regex)``, or ``default`` (``None`` if
        the argument is not provided) when there is none.

        ``replace_entities`` is forwarded to ``.re()``.
        """
        matches = self.re(regex, replace_entities=replace_entities)
        return next(iflatten(matches), default)

    def get(self):
        """
        Serialize the root with ``etree.tostring`` (without tail text) and
        return it as a unicode string.

        If the root cannot be serialized that way, ``True`` becomes ``u'1'``,
        ``False`` becomes ``u'0'`` and anything else is converted with
        ``six.text_type``.
        """
        try:
            return etree.tostring(self.root,
                                  method=self._tostring_method,
                                  encoding='unicode',
                                  with_tail=False)
        except (AttributeError, TypeError):
            pass

        node = self.root
        if node is True:
            return u'1'
        if node is False:
            return u'0'
        return six.text_type(node)
    extract = get

    def getall(self):
        """
        Return ``self.get()`` wrapped in a 1-element list.
        """
        return [self.get()]

    def register_namespace(self, prefix, uri):
        """
        Store ``uri`` under ``prefix`` in this selector's namespaces so that
        the prefix can be used in later queries.
        """
        self.namespaces[prefix] = uri

    def remove_namespaces(self):
        """
        Strip the ``{namespace}`` part from every element tag and attribute
        name under the root, then clean up the namespace declarations.
        """
        for node in self.root.iter('*'):
            tag = node.tag
            if tag.startswith('{'):
                node.tag = tag.split('}', 1)[1]
            # Attribute names get the same treatment as tags.
            attrs = node.attrib
            for name in attrs.keys():
                if not name.startswith('{'):
                    continue
                value = attrs.pop(name)
                attrs[name.split('}', 1)[1]] = value
        # remove namespace declarations
        etree.cleanup_namespaces(self.root)

    def remove(self):
        """
        Remove the root node from its parent element.

        Raises ``CannotRemoveElementWithoutRoot`` when the root has no
        ``getparent`` and ``CannotRemoveElementWithoutParent`` when it has no
        parent.
        """
        try:
            parent = self.root.getparent()
        except AttributeError:
            # 'str' object has no attribute 'getparent'
            raise CannotRemoveElementWithoutRoot(
                "The node you're trying to remove has no root, "
                "are you trying to remove a pseudo-element? "
                "Try to use 'li' as a selector instead of 'li::text' or "
                "'//li' instead of '//li/text()', for example."
            )

        try:
            parent.remove(self.root)
        except AttributeError:
            # 'NoneType' object has no attribute 'remove'
            raise CannotRemoveElementWithoutParent(
                "The node you're trying to remove has no parent, "
                "are you trying to remove a root element?"
            )

    @property
    def attrib(self):
        """Return a new dict copied from the root's ``attrib``.
        """
        return dict(self.root.attrib)

    def __bool__(self):
        """
        Return the truth value of the serialized content: ``True`` when
        ``self.get()`` is non-empty, ``False`` otherwise.
        """
        return bool(self.get())
    __nonzero__ = __bool__

    def __str__(self):
        preview = repr(shorten(self.get(), width=40))
        cls_name = type(self).__name__
        return "<%s xpath=%r data=%s>" % (cls_name, self._expr, preview)
    __repr__ = __str__

Read our documentation on viewing source code .

Loading