scrapy / scrapy
1
"""
2
This module provides some useful functions for working with
3
scrapy.http.Response objects
4
"""
5 7
import os
6 7
import webbrowser
7 7
import tempfile
8 7
from typing import Any, Callable, Iterable, Optional, Tuple, Union
9 7
from weakref import WeakKeyDictionary
10

11 7
import scrapy
12 7
from scrapy.http.response import Response
13

14 7
from twisted.web import http
15 7
from scrapy.utils.python import to_bytes, to_unicode
16 7
from w3lib import html
17

18

19 7
# Memo of base URLs computed by get_base_url(), keyed by response object.
# A WeakKeyDictionary is used so that a cache entry does not by itself keep
# its response alive.
_baseurl_cache: "WeakKeyDictionary[Response, str]" = WeakKeyDictionary()
20

21

22 7
def get_base_url(response: "scrapy.http.response.text.TextResponse") -> str:
    """Return the base URL of the given response, joined with the response URL.

    Only the first 4096 characters of the response text are handed to
    ``w3lib.html.get_base_url``. The result is memoised per response object
    in ``_baseurl_cache``, so repeated calls do not re-parse the body.
    """
    if response in _baseurl_cache:
        return _baseurl_cache[response]

    head = response.text[:4096]
    base_url = html.get_base_url(head, response.url, response.encoding)
    _baseurl_cache[response] = base_url
    return base_url
28

29

30 7
# Memo of results computed by get_meta_refresh(), keyed by response object
# only. A WeakKeyDictionary is used so that a cache entry does not by itself
# keep its response alive.
_metaref_cache: "WeakKeyDictionary[Response, Union[Tuple[None, None], Tuple[float, str]]]" = WeakKeyDictionary()
31

32

33 7
def get_meta_refresh(
    response: "scrapy.http.response.text.TextResponse",
    ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
) -> Union[Tuple[None, None], Tuple[float, str]]:
    """Parse the http-equiv refresh parameter from the given response.

    Only the first 4096 characters of the response text are passed to
    ``w3lib.html.get_meta_refresh``, together with the response URL, the
    response encoding and ``ignore_tags``.

    The result is memoised per response object in ``_metaref_cache``. The
    cache key is the response alone, so ``ignore_tags`` only has an effect
    on the first call made for a given response.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.text[:4096]
    refresh = html.get_meta_refresh(
        head,
        response.url,
        response.encoding,
        ignore_tags=ignore_tags,
    )
    _metaref_cache[response] = refresh
    return refresh
43

44

45 7
def response_status_message(status: Union[bytes, float, int, str]) -> str:
    """Return the status code followed by its descriptive text.

    ``status`` is converted with ``int()``. Codes that are missing from
    ``twisted.web.http.RESPONSES`` are described as "Unknown Status".
    """
    code = int(status)
    reason = to_unicode(http.RESPONSES.get(code, "Unknown Status"))
    return f'{code} {reason}'
51

52

53 7
def response_httprepr(response: Response) -> bytes:
    """Return a raw HTTP representation (as bytes) of the given response.

    This is provided only for reference: it is rebuilt from the response's
    status, headers and body, and is not the exact stream of bytes that was
    received (that's not exposed by Twisted).
    """
    status_line = b"".join((
        b"HTTP/1.1 ",
        to_bytes(str(response.status)),
        b" ",
        to_bytes(http.RESPONSES.get(response.status, b'')),
        b"\r\n",
    ))
    # The header section is emitted only when the response has headers.
    header_parts = (response.headers.to_string(), b"\r\n") if response.headers else ()
    # A blank line always separates the head of the message from the body.
    return b"".join((status_line, *header_parts, b"\r\n", response.body))
69

70

71 7
def open_in_browser(
    response: Union["scrapy.http.response.html.HtmlResponse", "scrapy.http.response.text.TextResponse"],
    _openfunc: Callable[[str], Any] = webbrowser.open,
) -> Any:
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work.

    The response body is written to a temporary file (``.html`` for an
    ``HtmlResponse``, ``.txt`` for any other ``TextResponse``) and the file is
    handed to ``_openfunc`` as a ``file://`` URL. The temporary file is not
    removed afterwards.

    For HTML bodies that contain no ``<base`` tag, a ``<base href="...">``
    pointing at ``response.url`` is inserted right after the first opening
    ``<head>`` tag. The tag is matched case-insensitively, with or without
    attributes. Bodies without a ``<head>`` tag are written unchanged.

    :param response: the response whose body should be displayed.
    :param _openfunc: callable receiving the ``file://`` URL; defaults to
        ``webbrowser.open``.
    :return: whatever ``_openfunc`` returns.
    :raises TypeError: if ``response`` is neither an ``HtmlResponse`` nor a
        ``TextResponse``.
    """
    import re

    from scrapy.http import HtmlResponse, TextResponse

    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            base_tag = to_bytes(f'<base href="{response.url}">')
            # ``count=1`` limits the edit to the first <head ...> tag, so later
            # literal occurrences (e.g. inside scripts) are left alone. The
            # ``\s`` guard keeps <header> from matching. A callable
            # replacement is used so the URL is inserted verbatim, without
            # backslash/group-reference processing.
            # NOTE: a <head> inside an HTML comment that precedes the real
            # tag would still be the one picked.
            body = re.sub(
                rb'<head(?:\s[^<>]*)?>',
                lambda match: match.group(0) + base_tag,
                body,
                count=1,
                flags=re.IGNORECASE,
            )
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: "
                        f"{response.__class__.__name__}")
    fd, fname = tempfile.mkstemp(ext)
    # Wrapping the descriptor in a file object guarantees it is closed even
    # if the write fails, and that the whole body is written.
    with os.fdopen(fd, 'wb') as tmp_file:
        tmp_file.write(body)
    return _openfunc(f"file://{fname}")

Read our documentation on viewing source code.

Loading