import logging
import re

from w3lib import html

from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse


# Module-level logger, named after this module per scrapy convention.
logger = logging.getLogger(__name__)


class AjaxCrawlMiddleware:
    """
    Downloader middleware handling 'AJAX crawlable' pages.

    Pages may advertise an AJAX-crawlable variant through a
    ``<meta name="fragment" content="!">`` tag; when such a tag is found,
    the request is retried with ``#!`` appended to its URL.
    For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
    """

    def __init__(self, settings):
        # Opt-in middleware: disabled unless AJAXCRAWL_ENABLED is set.
        if not settings.getbool('AJAXCRAWL_ENABLED'):
            raise NotConfigured

        # XXX: Google parses at least first 100k bytes; scrapy's redirect
        # middleware parses first 4k. 4k turns out to be insufficient
        # for this middleware, and parsing 100k could be slow.
        # We use something in between (32K) by default.
        self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        # Only plain GET requests that yielded a 200 HTML response can
        # have an AJAX-crawlable variant; anything else passes through.
        skip = (
            not isinstance(response, HtmlResponse)
            or response.status != 200
            # other HTTP methods are either not safe or don't have a body
            or request.method != 'GET'
            or 'ajax_crawlable' in request.meta  # prevent loops
            or not self._has_ajax_crawlable_variant(response)
        )
        if skip:
            return response

        # scrapy already handles #! links properly
        hashbang_request = request.replace(url=request.url + '#!')
        logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                     {'ajax_crawl_request': hashbang_request, 'request': request},
                     extra={'spider': spider})

        # Mark the replacement so it is not rewritten again on the next pass.
        hashbang_request.meta['ajax_crawlable'] = True
        return hashbang_request

    def _has_ajax_crawlable_variant(self, response):
        """
        Return True if a page without hash fragment could be "AJAX crawlable"
        according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
        """
        # Inspect only the first ``lookup_bytes`` characters for speed.
        return _has_ajaxcrawlable_meta(response.text[:self.lookup_bytes])


# XXX: move it to w3lib?
67 7
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')


def _has_ajaxcrawlable_meta(text):
    """
    Return True if *text* (an HTML snippet) opts into Google's AJAX
    crawling scheme via a ``<meta name="fragment" content="!">`` tag.

    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Fast fail-fast path: a plain substring scan is roughly 20x cheaper
    # than the script/comment stripping below and rules out most pages.
    if 'fragment' not in text or 'content' not in text:
        return False

    # Strip content a browser would not interpret as live markup, so a
    # commented-out or script-embedded meta tag does not count.
    cleaned = html.remove_tags_with_content(text, ('script', 'noscript'))
    cleaned = html.replace_entities(cleaned)
    cleaned = html.remove_comments(cleaned)
    return bool(_ajax_crawlable_re.search(cleaned))
