Use https link
1 |
"""
|
|
2 |
XPath selectors based on lxml
|
|
3 |
"""
|
|
4 |
|
|
5 |
import sys |
|
6 |
|
|
7 |
import six |
|
8 |
from lxml import etree, html |
|
9 |
|
|
10 |
from .utils import flatten, iflatten, extract_regex, shorten |
|
11 |
from .csstranslator import HTMLTranslator, GenericTranslator |
|
12 |
|
|
13 |
|
|
14 |
class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``Selector.remove()`` when the selected node has no
    ``getparent`` method (for instance a plain string result), so it cannot
    be detached from a tree."""
|
|
16 |
|
|
17 |
|
|
18 |
class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``Selector.remove()`` when the selected node has no parent
    to be removed from (``getparent()`` gave back an object without a
    ``remove`` method, e.g. ``None``)."""
|
|
20 |
|
|
21 |
|
|
22 |
class SafeXMLParser(etree.XMLParser):
    """``etree.XMLParser`` variant that passes ``resolve_entities=False``
    unless the caller supplies that option explicitly."""

    def __init__(self, *args, **kwargs):
        # Only fill in the default; an explicit caller choice always wins.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
|
26 |
|
|
27 |
|
|
28 |
# Per selector type: the parser class used to build the tree, the CSS-to-XPath
# translator instance, and the ``method`` handed to ``etree.tostring``.
_ctgroup = dict(
    html=dict(
        _parser=html.HTMLParser,
        _csstranslator=HTMLTranslator(),
        _tostring_method='html',
    ),
    xml=dict(
        _parser=SafeXMLParser,
        _csstranslator=GenericTranslator(),
        _tostring_method='xml',
    ),
)
|
|
36 |
|
|
37 |
|
|
38 |
def _st(st): |
|
39 |
if st is None: |
|
40 |
return 'html' |
|
41 |
elif st in _ctgroup: |
|
42 |
return st |
|
43 |
else: |
|
44 |
raise ValueError('Invalid type: %s' % st) |
|
45 |
|
|
46 |
|
|
47 |
def create_root_node(text, parser_cls, base_url=None):
    """Parse ``text`` with a fresh ``parser_cls`` instance and return the root.

    Surrounding whitespace and NUL characters are dropped and the text is
    encoded as UTF-8 before parsing. When nothing is left to parse, or the
    parser yields ``None``, the document ``<html/>`` is parsed instead.

    ``base_url`` is forwarded to ``etree.fromstring``.
    """
    cleaned = text.strip().replace('\x00', '')
    body = cleaned.encode('utf8')
    if not body:
        body = b'<html/>'

    parser = parser_cls(recover=True, encoding='utf8')
    root = etree.fromstring(body, parser=parser, base_url=base_url)
    if root is not None:
        return root
    # The parser could not produce any node: fall back to an empty document.
    return etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
|
56 |
|
|
57 |
|
|
58 |
class SelectorList(list):
    """
    A subclass of the builtin ``list`` whose extra methods forward a call
    to the selectors it contains and combine the results.
    """

    # Deprecated hook; only the Python 2 ``list`` builtin still implements
    # (and therefore dispatches simple slices to) ``__getslice__``.
    def __getslice__(self, i, j):
        return self.__class__(super(SelectorList, self).__getslice__(i, j))

    def __getitem__(self, pos):
        """Index like a list, but keep slices wrapped in this class."""
        item = super(SelectorList, self).__getitem__(pos)
        if isinstance(pos, slice):
            return self.__class__(item)
        return item

    def __getstate__(self):
        raise TypeError("can't pickle SelectorList objects")

    def xpath(self, xpath, namespaces=None, **kwargs):
        """
        Call ``.xpath()`` on every element of this list and return the
        flattened results as a new list of the same class.

        ``xpath`` is passed as the query argument of each element's
        ``.xpath()`` method (see :meth:`Selector.xpath`).

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        forwarded to each element's ``.xpath()`` call.

        Any additional named arguments are forwarded as well; they can be
        used to pass values for XPath variables, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")
        """
        per_element = [
            sel.xpath(xpath, namespaces=namespaces, **kwargs) for sel in self
        ]
        return self.__class__(flatten(per_element))

    def css(self, query):
        """
        Call ``.css()`` on every element of this list and return the
        flattened results as a new list of the same class.

        ``query`` is the same argument as the one in :meth:`Selector.css`.
        """
        per_element = [sel.css(query) for sel in self]
        return self.__class__(flatten(per_element))

    def re(self, regex, replace_entities=True):
        """
        Call ``.re()`` on every element of this list and return the
        flattened results.

        ``replace_entities`` is forwarded unchanged to each element's
        ``.re()`` (see :meth:`Selector.re`); it defaults to ``True``.
        """
        per_element = [
            sel.re(regex, replace_entities=replace_entities) for sel in self
        ]
        return flatten(per_element)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Return the first item produced by calling ``.re()`` on the elements
        of this list, in order. If the list is empty or nothing matches,
        return ``default`` (``None`` when not given).

        ``replace_entities`` is forwarded unchanged to each element's
        ``.re()`` (see :meth:`Selector.re`); it defaults to ``True``.
        """
        matches = iflatten(
            sel.re(regex, replace_entities=replace_entities) for sel in self
        )
        for first in matches:
            return first
        return default

    def getall(self):
        """
        Call ``.get()`` on each element in this list and return the results
        as a list, one entry per element.
        """
        return [sel.get() for sel in self]
    extract = getall

    def get(self, default=None):
        """
        Return the result of ``.get()`` for the first element in this list.
        If the list is empty, return ``default``.
        """
        if not self:
            return default
        return self[0].get()
    extract_first = get

    @property
    def attrib(self):
        """Return the ``attrib`` of the first element.

        If the list is empty, return an empty dict.
        """
        if not self:
            return {}
        return self[0].attrib

    def remove(self):
        """
        Call ``.remove()`` on each element in this list.
        """
        for sel in self:
            sel.remove()
|
165 |
|
|
166 |
|
|
167 |
class Selector(object):
    """
    :class:`Selector` wraps a parsed XML or HTML document (or a node of it)
    and lets you query it with CSS or XPath expressions and extract data.

    ``text`` must be a text string (``unicode`` in Python 2, ``str`` in
    Python 3); any other type raises ``TypeError``.

    ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
    If ``type`` is ``None``, the selector defaults to ``"html"``.

    ``root`` is an already parsed node; it is used when ``text`` is ``None``.
    Passing neither ``text`` nor ``root`` raises ``ValueError``.

    ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
    See [`lxml` documentation](https://lxml.de/api/index.html) ``lxml.etree.fromstring`` for more information.
    """

    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
                 '__weakref__', '_parser', '_csstranslator', '_tostring_method']

    _default_type = None
    _default_namespaces = {
        "re": "http://exslt.org/regular-expressions",

        # supported in libxslt:
        # set:difference
        # set:has-same-node
        # set:intersection
        # set:leading
        # set:trailing
        "set": "http://exslt.org/sets"
    }
    _lxml_smart_strings = False
    selectorlist_cls = SelectorList

    def __init__(self, text=None, type=None, namespaces=None, root=None,
                 base_url=None, _expr=None):
        kind = _st(type or self._default_type)
        settings = _ctgroup[kind]
        self.type = kind
        self._parser = settings['_parser']
        self._csstranslator = settings['_csstranslator']
        self._tostring_method = settings['_tostring_method']

        if text is None:
            if root is None:
                raise ValueError("Selector needs either text or root argument")
        elif isinstance(text, six.text_type):
            # A given text always takes precedence over a given root.
            root = self._get_root(text, base_url)
        else:
            msg = "text argument should be of type %s, got %s" % (
                six.text_type, text.__class__)
            raise TypeError(msg)

        # Start from a copy so the class-level defaults are never mutated.
        merged = dict(self._default_namespaces)
        if namespaces is not None:
            merged.update(namespaces)
        self.namespaces = merged
        self.root = root
        self._expr = _expr

    def __getstate__(self):
        raise TypeError("can't pickle Selector objects")

    def _get_root(self, text, base_url=None):
        """Parse ``text`` with this selector's parser class."""
        return create_root_node(text, self._parser, base_url=base_url)

    def xpath(self, query, namespaces=None, **kwargs):
        """
        Evaluate the XPath ``query`` against the root and return the matches
        as a ``selectorlist_cls`` instance whose items are new selectors of
        this same class.

        ``query`` is a string containing the XPath query to apply.

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        added, for this call only, on top of the prefixes already stored in
        ``self.namespaces`` (see ``register_namespace(prefix, uri)``).

        Any additional named arguments are passed on to the evaluator and can
        be used to give values for XPath variables, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")

        An ``etree.XPathError`` is re-raised as ``ValueError`` with the
        original traceback.
        """
        try:
            evaluate = self.root.xpath
        except AttributeError:
            # Roots without an ``xpath`` method cannot be queried further.
            return self.selectorlist_cls([])

        prefixes = dict(self.namespaces)
        if namespaces is not None:
            prefixes.update(namespaces)

        try:
            result = evaluate(query, namespaces=prefixes,
                              smart_strings=self._lxml_smart_strings,
                              **kwargs)
        except etree.XPathError as exc:
            msg = u"XPath error: %s in %s" % (exc, query)
            if not six.PY3:
                msg = msg.encode('unicode_escape')
            six.reraise(ValueError, ValueError(msg), sys.exc_info()[2])

        # Anything that is not exactly a list is wrapped so it can be
        # handled uniformly below.
        if type(result) is not list:
            result = [result]

        cls = self.__class__
        children = [
            cls(root=node, _expr=query, namespaces=self.namespaces,
                type=self.type)
            for node in result
        ]
        return self.selectorlist_cls(children)

    def css(self, query):
        """
        Apply the given CSS selector and return a :class:`SelectorList` instance.

        ``query`` is a string containing the CSS selector to apply.

        The CSS query is translated into an XPath query by this selector's
        CSS translator (built on the `cssselect`_ library) and then run
        through ``.xpath()``.

        .. _cssselect: https://pypi.python.org/pypi/cssselect/
        """
        translated = self._css2xpath(query)
        return self.xpath(translated)

    def _css2xpath(self, query):
        """Translate a CSS ``query`` to XPath with this selector's translator."""
        return self._csstranslator.css_to_xpath(query)

    def re(self, regex, replace_entities=True):
        """
        Run ``extract_regex`` with ``regex`` over the serialized content of
        this selector (``self.get()``) and return its result.

        ``regex`` and ``replace_entities`` (default ``True``) are forwarded
        unchanged to ``extract_regex``.
        """
        serialized = self.get()
        return extract_regex(regex, serialized, replace_entities=replace_entities)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Return the first item of the flattened ``.re()`` result. If there is
        none, return ``default`` (``None`` when not given).

        ``replace_entities`` is forwarded unchanged to ``.re()``.
        """
        matches = self.re(regex, replace_entities=replace_entities)
        return next(iflatten(matches), default)

    def get(self):
        """
        Serialize the root with ``etree.tostring`` (without tail text) and
        return it as a unicode string.

        Roots that cannot be serialized that way are converted directly:
        ``True`` gives ``u'1'``, ``False`` gives ``u'0'`` and anything else
        goes through ``six.text_type``.

        NOTE(review): the previous docstring stated that percent encoded
        content is unquoted; that is not visible in this code, so confirm
        it against the lxml serializer.
        """
        try:
            return etree.tostring(self.root,
                                  method=self._tostring_method,
                                  encoding='unicode',
                                  with_tail=False)
        except (AttributeError, TypeError):
            root = self.root
            if root is True:
                return u'1'
            if root is False:
                return u'0'
            return six.text_type(root)
    extract = get

    def getall(self):
        """
        Return the result of ``.get()`` wrapped in a 1-element list.
        """
        return [self.get()]

    def register_namespace(self, prefix, uri):
        """
        Register the given namespace to be used in this :class:`Selector`.
        The ``prefix`` to ``uri`` mapping is stored in ``self.namespaces``
        and is therefore used by later ``.xpath()`` calls.
        See :ref:`selector-examples-xml`.
        """
        self.namespaces[prefix] = uri

    def remove_namespaces(self):
        """
        Remove all namespaces, allowing to traverse the document using
        namespace-less xpaths. See :ref:`removing-namespaces`.
        """
        for el in self.root.iter('*'):
            tag = el.tag
            if tag.startswith('{'):
                el.tag = tag.split('}', 1)[1]
            # attribute names can carry a ``{uri}`` prefix as well
            for name in el.attrib.keys():
                if not name.startswith('{'):
                    continue
                # pop first, then compute the new key, as the original
                # single-statement assignment did
                value = el.attrib.pop(name)
                el.attrib[name.split('}', 1)[1]] = value
        # remove namespace declarations
        etree.cleanup_namespaces(self.root)

    def remove(self):
        """
        Remove the root node of this selector from its parent element.

        Raises ``CannotRemoveElementWithoutRoot`` when the root has no
        ``getparent`` method and ``CannotRemoveElementWithoutParent`` when
        the parent has no ``remove`` method.
        """
        try:
            parent = self.root.getparent()
        except AttributeError:
            # 'str' object has no attribute 'getparent'
            no_root_msg = (
                "The node you're trying to remove has no root, "
                "are you trying to remove a pseudo-element? "
                "Try to use 'li' as a selector instead of 'li::text' or "
                "'//li' instead of '//li/text()', for example."
            )
            raise CannotRemoveElementWithoutRoot(no_root_msg)

        try:
            parent.remove(self.root)
        except AttributeError:
            # 'NoneType' object has no attribute 'remove'
            no_parent_msg = (
                "The node you're trying to remove has no parent, "
                "are you trying to remove a root element?"
            )
            raise CannotRemoveElementWithoutParent(no_parent_msg)

    @property
    def attrib(self):
        """Return a new dict copied from the root element's attributes.
        """
        attributes = self.root.attrib
        return dict(attributes)

    def __bool__(self):
        """
        Return ``True`` if ``.get()`` yields a non-empty value and ``False``
        otherwise. In other words, the boolean value of a :class:`Selector`
        is given by the contents it selects.
        """
        content = self.get()
        return bool(content)
    __nonzero__ = __bool__

    def __str__(self):
        """Short debug form: class name, originating query and a preview of
        the data shortened to a width of 40."""
        data = repr(shorten(self.get(), width=40))
        cls_name = type(self).__name__
        return "<%s xpath=%r data=%s>" % (cls_name, self._expr, data)
    __repr__ = __str__
Read our documentation on viewing source code .