"""
This module contains general purpose URL functions not found in the standard
library.

Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
"""
import posixpath
import re
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse

# scrapy.utils.url was moved to w3lib.url and import * ensures this
# move doesn't break old code
from w3lib.url import *
from w3lib.url import _safe_chars, _unquotepath  # noqa: F401

from scrapy.utils.python import to_unicode


def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains.

    A URL "belongs" to a domain when its host is the domain itself or any
    subdomain of it; comparison is case-insensitive.
    """
    host = parse_url(url).netloc.lower()
    if not host:
        return False
    for domain in domains:
        domain = domain.lower()
        if host == domain or host.endswith(f'.{domain}'):
            return True
    return False


def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider.

    The spider's name and its ``allowed_domains`` attribute (if any) are
    all treated as candidate domains.
    """
    candidate_domains = [spider.name, *getattr(spider, 'allowed_domains', [])]
    return url_is_from_any_domain(url, candidate_domains)


def url_has_any_extension(url, extensions):
    """Return True if the (lowercased) file extension of the URL's path
    component is one of *extensions*."""
    path = parse_url(url).path
    extension = posixpath.splitext(path)[1].lower()
    return extension in extensions


def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url, in which case it is returned untouched).
    """
    already_parsed = isinstance(url, ParseResult)
    return url if already_parsed else urlparse(to_unicode(url, encoding))


def escape_ajax(url):
    """
    Return the crawleable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    # Only "#!"-style (hash-bang) fragments are AJAX-crawlable; anything
    # else is returned unchanged.
    if frag.startswith('!'):
        return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
    return url


def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url."""
    if re.match(r"^\w+://", url, flags=re.I):
        return url
    # A leading "//" already carries a netloc, so only "http:" is needed;
    # otherwise prepend the full "http://" prefix.
    prefix = "http:" if urlparse(url).netloc else "http://"
    return prefix + url


def _is_posix_path(string):
87 7
    return bool(
88
        re.match(
89
            r'''
90
            ^                   # start with...
91
            (
92
                \.              # ...a single dot,
93
                (
94
                    \. | [^/\.]+  # optionally followed by
95
                )?                # either a second dot or some characters
96
                |
97
                ~   # $HOME
98
            )?      # optional match of ".", ".." or ".blabla"
99
            /       # at least one "/" for a file path,
100
            .       # and something after the "/"
101
            ''',
102
            string,
103
            flags=re.VERBOSE,
104
        )
105
    )
106

107

108 7
def _is_windows_path(string):
109 7
    return bool(
110
        re.match(
111
            r'''
112
            ^
113
            (
114
                [a-z]:\\
115
                | \\\\
116
            )
117
            ''',
118
            string,
119
            flags=re.IGNORECASE | re.VERBOSE,
120
        )
121
    )
122

123

124 7
def _is_filesystem_path(string):
    """Return True if *string* looks like a local filesystem path of
    either flavor (POSIX or Windows)."""
    return any(check(string) for check in (_is_posix_path, _is_windows_path))


def guess_scheme(url):
    """Add an URL scheme if missing: file:// for filepath-like input or
    http:// otherwise."""
    looks_like_path = _is_filesystem_path(url)
    return any_to_uri(url) if looks_like_path else add_http_if_no_scheme(url)


def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):

    """Strip URL string from some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces path component with "/", also dropping
      query and fragment components ; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    """

    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
        # Everything before the last "@" is the userinfo part.
        netloc = netloc.split('@')[-1]
    if strip_default_port and parsed_url.port:
        if (parsed_url.scheme, parsed_url.port) in (('http', 80),
                                                    ('https', 443),
                                                    ('ftp', 21)):
            # Drop only the trailing ":port". A plain str.replace() would
            # also corrupt other occurrences of the same digits, e.g. in
            # unstripped "user:80pass@" credentials, or leave ":00" behind
            # for a non-canonical ":0080".
            netloc = netloc.rsplit(':', 1)[0]
    return urlunparse((
        parsed_url.scheme,
        netloc,
        '/' if origin_only else parsed_url.path,
        '' if origin_only else parsed_url.params,
        '' if origin_only else parsed_url.query,
        '' if strip_fragment else parsed_url.fragment
    ))
