scrapy / scrapy
1
"""
2
This module provides some useful functions for working with
3
scrapy.http.Request objects
4
"""
5

6 7
import hashlib
7 7
from typing import Dict, Iterable, Optional, Tuple, Union
8 7
from urllib.parse import urlunparse
9 7
from weakref import WeakKeyDictionary
10

11 7
from w3lib.http import basic_auth_header
12 7
from w3lib.url import canonicalize_url
13

14 7
from scrapy.http import Request
15 7
from scrapy.utils.httpobj import urlparse_cached
16 7
from scrapy.utils.python import to_bytes, to_unicode
17

18

19 7
# Per-request memo of computed fingerprints. Keys are held weakly, so an
# entry disappears once its Request object is garbage collected. The inner
# dict maps (normalised header names or None, keep_fragments) to the digest.
_fingerprint_cache: "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], str]]"
_fingerprint_cache = WeakKeyDictionary()
21

22

23 7
def request_fingerprint(
    request: Request,
    include_headers: Optional[Iterable[Union[bytes, str]]] = None,
    keep_fragments: bool = False,
) -> str:
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.
    Header names are matched case-insensitively and their order is irrelevant.

    Also, servers usually ignore fragments in urls when handling requests,
    so they are also ignored by default when calculating the fingerprint.
    If you want to include them, set the keep_fragments argument to True
    (for instance when handling requests with a headless browser).

    The result is the hexadecimal SHA-1 digest of the request method, the
    canonicalized URL, the body and the selected headers. It is memoised per
    request object and per (include_headers, keep_fragments) combination.
    """
    headers: Optional[Tuple[bytes, ...]] = None
    if include_headers:
        # Normalise to lower-case bytes *before* sorting. Sorting the raw
        # names would make the order (and so the cache key and the digest)
        # depend on the caller's capitalisation, and would fail outright on
        # a mix of bytes and str names.
        headers = tuple(sorted(to_bytes(h.lower()) for h in include_headers)) or None
    cache = _fingerprint_cache.setdefault(request, {})
    cache_key = (headers, keep_fragments)
    if cache_key not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
        fp.update(request.body or b'')
        for hdr in headers or ():
            # Only headers actually present on the request contribute.
            if hdr in request.headers:
                fp.update(hdr)
                for v in request.headers.getlist(hdr):
                    fp.update(v)
        cache[cache_key] = fp.hexdigest()
    return cache[cache_key]
77

78

79 7
def request_authenticate(request: Request, username: str, password: str) -> None:
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617) and the given username and password.

    The request's ``Authorization`` header is overwritten; nothing is returned.
    """
    request.headers['Authorization'] = basic_auth_header(username, password)
84

85

86 7
def request_httprepr(request: Request) -> bytes:
    """Return the raw HTTP representation (as bytes) of the given request.

    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).

    The request line carries only the path, params and query of the URL
    (``/`` when the path is empty); the ``Host`` line carries the hostname
    alone, or is left empty when the URL has none.
    """
    parsed = urlparse_cached(request)
    # Scheme, netloc and fragment are blanked out: only the request target
    # belongs on the request line.
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    parts = [
        to_bytes(request.method), b" ", to_bytes(path), b" HTTP/1.1\r\n",
        b"Host: ", to_bytes(parsed.hostname or b''), b"\r\n",
    ]
    if request.headers:
        parts.append(request.headers.to_string())
        parts.append(b"\r\n")
    # A blank line separates the header section from the body.
    parts.append(b"\r\n")
    parts.append(request.body)
    return b"".join(parts)
101

102

103 7
def referer_str(request: Request) -> Optional[str]:
    """Return the Referer HTTP header in a form suitable for logging.

    Returns ``None`` when the request has no ``Referer`` header. Otherwise
    the header value is converted to text with ``errors='replace'``, so
    undecodable bytes cannot make a log call fail.
    """
    referrer = request.headers.get('Referer')
    if referrer is None:
        return None
    return to_unicode(referrer, errors='replace')

Read our documentation on viewing source code.

Loading