import numbers
import os
import sys
import warnings
from configparser import ConfigParser
from operator import itemgetter

from scrapy.exceptions import ScrapyDeprecationWarning, UsageError

from scrapy.settings import BaseSettings
from scrapy.utils.deprecate import update_classpath
from scrapy.utils.python import without_none_values


def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary."""

    def _check_components(complist):
        if len({convert(c) for c in complist}) != len(complist):
            raise ValueError(f'Some paths in {complist!r} convert to the same object, '
                             'please update your settings')

    def _map_keys(compdict):
        if isinstance(compdict, BaseSettings):
            compbs = BaseSettings()
            for k, v in compdict.items():
                prio = compdict.getpriority(k)
                if compbs.getpriority(convert(k)) == prio:
                    raise ValueError(f'Some paths in {list(compdict.keys())!r} '
                                     'convert to the same object, '
                                     'please update your settings')
                else:
                    compbs.set(convert(k), v, priority=prio)
            return compbs
        else:
            _check_components(compdict)
            return {convert(k): v for k, v in compdict.items()}

    def _validate_values(compdict):
        """Fail if a value in the components dict is not a real number or None."""
        for name, value in compdict.items():
            if value is not None and not isinstance(value, numbers.Real):
                raise ValueError(f'Invalid value {value} for component {name}, '
                                 'please provide a real number or None instead')

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(c) for c in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _validate_values(compdict)
    compdict = without_none_values(_map_keys(compdict))
    return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
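# Usage sketch for build_component_list (illustrative only): entries whose
# order is None are dropped and the remaining class paths are returned sorted
# by their numeric order. Assuming the example paths below are not affected
# by any deprecation rule applied by ``update_classpath``:
#
#     build_component_list({'project.mw.A': 550,
#                           'project.mw.B': 100,
#                           'project.mw.C': None})
#     # -> ['project.mw.B', 'project.mw.A']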

def arglist_to_dict(arglist):
    """Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
    dict
    """
    return dict(x.split('=', 1) for x in arglist)
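# Sketch (illustrative): only the first '=' is used as the separator, so
# values may themselves contain '=':
#
#     arglist_to_dict(['arg1=val1', 'arg2=a=b'])
#     # -> {'arg1': 'val1', 'arg2': 'a=b'}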

def closest_scrapy_cfg(path='.', prevpath=None):
    """Return the path to the closest scrapy.cfg file by traversing the current
    directory and its parents
    """
    if path == prevpath:
        return ''
    path = os.path.abspath(path)
    cfgfile = os.path.join(path, 'scrapy.cfg')
    if os.path.exists(cfgfile):
        return cfgfile
    return closest_scrapy_cfg(os.path.dirname(path), path)
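# Sketch (illustrative): each recursive call moves one directory up from the
# working directory; at the filesystem root os.path.dirname() returns the
# same path, so path == prevpath and '' is returned when no scrapy.cfg exists
# anywhere up the tree:
#
#     closest_scrapy_cfg()                  # e.g. '/home/user/proj/scrapy.cfg'
#     closest_scrapy_cfg('/tmp/empty-dir')  # '' if no ancestor has a scrapy.cfg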

def init_env(project='default', set_syspath=True):
    """Initialize environment to use command-line tool from inside a project
    dir. This sets the Scrapy settings module and modifies the Python path to
    be able to locate the project module.
    """
    cfg = get_config()
    if cfg.has_option('settings', project):
        os.environ['SCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
    closest = closest_scrapy_cfg()
    if closest:
        projdir = os.path.dirname(closest)
        if set_syspath and projdir not in sys.path:
            sys.path.append(projdir)
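# Sketch (illustrative): given a project scrapy.cfg such as
#
#     [settings]
#     default = myproject.settings
#
# init_env() sets the SCRAPY_SETTINGS_MODULE environment variable to
# 'myproject.settings' and, if set_syspath is True, appends the directory
# containing scrapy.cfg to sys.path so the project package is importable.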

def get_config(use_closest=True):
    """Get Scrapy config file as a ConfigParser"""
    sources = get_sources(use_closest)
    cfg = ConfigParser()
    cfg.read(sources)
    return cfg

def get_sources(use_closest=True):
    """Return the list of locations that are searched for scrapy.cfg files."""
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
    sources = [
        '/etc/scrapy.cfg',
        r'c:\scrapy\scrapy.cfg',
        xdg_config_home + '/scrapy.cfg',
        os.path.expanduser('~/.scrapy.cfg'),
    ]
    if use_closest:
        sources.append(closest_scrapy_cfg())
    return sources
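# Sketch (illustrative): ConfigParser.read() applies the sources in order, so
# later files override earlier ones and the project-local scrapy.cfg found by
# closest_scrapy_cfg() (appended last) takes precedence:
#
#     cfg = get_config()
#     if cfg.has_option('settings', 'default'):
#         cfg.get('settings', 'default')    # e.g. 'myproject.settings'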

def feed_complete_default_values_from_settings(feed, settings):
    """Return a copy of the feed options dict with any missing option filled
    in from the corresponding FEED_* settings.
    """
    out = feed.copy()
    out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT'))
    out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
    out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None)
    out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
    out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
    out.setdefault("item_export_kwargs", dict())
    if settings["FEED_EXPORT_INDENT"] is None:
        out.setdefault("indent", None)
    else:
        out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
    return out
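# Sketch (illustrative): values already present in the feed dict win over the
# settings-derived defaults; missing keys are filled in:
#
#     feed_complete_default_values_from_settings(
#         {'format': 'json', 'encoding': 'latin-1'}, settings)
#     # -> keeps 'encoding': 'latin-1' and adds defaults such as
#     #    'store_empty', 'uri_params' and 'indent' from the settings.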

def feed_process_params_from_cli(settings, output, output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')
    ).keys()

    def check_valid_format(output_format):
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension."
            )

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output"
            )
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.'
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified'
            )

    result = {}
    for element in output:
        try:
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # The FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))

    return result
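# Sketch (illustrative, assuming default settings and an empty FEEDS setting):
# a format given after the last colon of the URI wins, otherwise the file
# extension is used, and '-' maps to standard output:
#
#     feed_process_params_from_cli(settings, ['items.json'])
#     # -> {'items.json': {'format': 'json'}}
#     feed_process_params_from_cli(settings, ['items.dat:csv'])
#     # -> {'items.dat': {'format': 'csv'}}
#     feed_process_params_from_cli(settings, ['-:jsonlines'])
#     # -> {'stdout:': {'format': 'jsonlines'}}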
