"""
2
Module for processing Sitemaps.
3

4
Note: The main purpose of this module is to provide support for the
5
SitemapSpider, its API is subject to change without notice.
6
"""
7

from urllib.parse import urljoin

import lxml.etree


class Sitemap:
    """Class to parse Sitemap (type=urlset) and Sitemap Index
    (type=sitemapindex) files"""

    def __init__(self, xmltext):
        # recover=True tolerates malformed XML; resolve_entities=False
        # avoids expanding external entities in untrusted documents.
        xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
        self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
        # Strip the XML namespace, e.g. '{http://...}urlset' -> 'urlset'.
        rt = self._root.tag
        self.type = rt.split('}', 1)[1] if '}' in rt else rt

    def __iter__(self):
        for elem in self._root.getchildren():
            d = {}
            for el in elem.getchildren():
                tag = el.tag
                name = tag.split('}', 1)[1] if '}' in tag else tag

                if name == 'link':
                    # Alternate links (e.g. xhtml:link hreflang entries)
                    # are accumulated under the 'alternate' key.
                    if 'href' in el.attrib:
                        d.setdefault('alternate', []).append(el.get('href'))
                else:
                    d[name] = el.text.strip() if el.text else ''

            # Only entries that carry a <loc> element are yielded.
            if 'loc' in d:
                yield d
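
# Illustrative usage (not part of the original module; the sample XML below
# is an assumption for demonstration):
#
#   >>> s = Sitemap(b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
#   ...             b'<url><loc>http://www.example.com/</loc></url></urlset>')
#   >>> s.type
#   'urlset'
#   >>> list(s)
#   [{'loc': 'http://www.example.com/'}]
#
# Alternate-language links are grouped under 'alternate':
#
#   >>> s = Sitemap(b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
#   ...             b' xmlns:xhtml="http://www.w3.org/1999/xhtml">'
#   ...             b'<url><loc>http://www.example.com/</loc>'
#   ...             b'<xhtml:link rel="alternate" hreflang="de"'
#   ...             b' href="http://www.example.com/de"/></url></urlset>')
#   >>> list(s)
#   [{'loc': 'http://www.example.com/', 'alternate': ['http://www.example.com/de']}]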


def sitemap_urls_from_robots(robots_text, base_url=None):
    """Return an iterator over all sitemap URLs contained in the given
    robots.txt file
    """
    for line in robots_text.splitlines():
        if line.lstrip().lower().startswith('sitemap:'):
            # Everything after the first ':' is the sitemap URL; it is
            # resolved against base_url in case it is relative.
            url = line.split(':', 1)[1].strip()
            yield urljoin(base_url, url)
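

# Minimal self-check (illustrative, not part of the original module; the
# robots.txt text and URLs below are assumptions chosen for demonstration):
if __name__ == '__main__':
    robots = (
        "User-agent: *\n"
        "Disallow: /private/\n"
        "Sitemap: http://www.example.com/sitemap.xml\n"
    )
    # 'Sitemap:' lines are extracted and resolved against base_url.
    print(list(sitemap_urls_from_robots(robots, base_url='http://www.example.com')))
    # -> ['http://www.example.com/sitemap.xml']

    index = Sitemap(
        b'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        b'<sitemap><loc>http://www.example.com/sitemap1.xml</loc></sitemap>'
        b'</sitemapindex>'
    )
    print(index.type)   # -> 'sitemapindex'
    print(list(index))  # -> [{'loc': 'http://www.example.com/sitemap1.xml'}]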
