scrapy / scrapy
1 7
import struct
2 7
from gzip import GzipFile
3 7
from io import BytesIO
4

5 7
from scrapy.utils.decorators import deprecated
6

7

8
# - GzipFile's read() has issues returning leftover uncompressed data when
#   input is corrupted
# - read1(), which fetches data before raising EOFError on next call
#   works here
@deprecated('GzipFile.read1')
def read1(gzf, size=-1):
    """Deprecated pass-through to ``gzf.read1(size)``.

    Kept for backward compatibility; call ``GzipFile.read1`` directly instead.
    """
    return gzf.read1(size)
15

16

17 7
def gunzip(data):
    """Gunzip the given data and return as much data as possible.

    This is resilient to CRC checksum errors: decompression stops at the
    first error, and whatever was successfully decompressed so far is
    returned instead of raising (unless nothing at all was recovered).
    """
    f = GzipFile(fileobj=BytesIO(data))
    output_list = []
    # Non-empty sentinel so the while loop runs at least once.
    chunk = b'.'
    while chunk:
        try:
            # read1() returns data already fetched before raising on the
            # *next* call, unlike read(), which loses it on corrupt input.
            # 8196 is the historical chunk size here (presumably a typo for
            # 8192, but harmless either way).
            chunk = f.read1(8196)
            output_list.append(chunk)
        except (IOError, EOFError, struct.error):
            # complete only if there is some data, otherwise re-raise
            # see issue 87 about catching struct.error
            # some pages are quite small so output_list is empty and f.extrabuf
            # contains the whole page content
            if output_list or getattr(f, 'extrabuf', None):
                try:
                    # extrabuf/extrasize are Py2-era GzipFile internals;
                    # they may not exist on newer Pythons.
                    output_list.append(f.extrabuf[-f.extrasize:])
                finally:
                    # NOTE: a `break` inside `finally` discards any in-flight
                    # exception from the `try` above (e.g. AttributeError when
                    # extrabuf/extrasize are missing), so we still fall through
                    # and return the data recovered so far. Intentional.
                    break
            else:
                raise
    return b''.join(output_list)
42

43

44 7
def gzip_magic_number(response):
    """Return True if ``response.body`` begins with the gzip magic bytes.

    Checks for the 3-byte prefix ``\\x1f\\x8b\\x08`` (gzip signature plus the
    DEFLATE compression-method byte).
    """
    return response.body.startswith(b'\x1f\x8b\x08')

Read our documentation on viewing source code.

Loading