PromyLOPh / crocoite

Compare 94be61a ... +12 ... 71af236

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then set up custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.


@@ -57,9 +57,10 @@
Loading
57 57
        fd.seek (0)
58 58
        for it in ArchiveIterator (fd):
59 59
            headers = it.rec_headers
60 -
            assert headers['warc-type'] == 'resource'
61 -
            assert headers['warc-target-uri'].endswith (':log')
62 -
            assert headers['content-type'] == f'text/plain; encoding={handler.logEncoding}'
60 +
            assert headers['warc-type'] == 'metadata'
61 +
            assert 'warc-target-uri' not in headers
62 +
            assert headers['x-crocoite-type'] == 'log'
63 +
            assert headers['content-type'] == f'application/json; encoding={handler.logEncoding}'
63 64
64 65
            while True:
65 66
                l = it.raw_stream.readline ()
@@ -108,7 +109,8 @@
Loading
108 109
109 110
                headers = rec.rec_headers
110 111
                assert headers['warc-type'] == 'warcinfo'
111 -
                assert headers['warc-target-uri'].endswith (':warcinfo')
112 +
                assert 'warc-target-uri' not in headers
113 +
                assert 'x-crocoite-type' not in headers
112 114
113 115
                data = json.load (rec.raw_stream)
114 116
                assert data == g.payload
@@ -119,11 +121,14 @@
Loading
119 121
                rec = next (it)
120 122
121 123
                headers = rec.rec_headers
122 -
                assert headers['warc-type'] == 'metadata'
124 +
                assert headers['warc-type'] == 'resource'
125 +
                assert headers['content-type'] == 'application/javascript; charset=utf-8'
126 +
                assert headers['x-crocoite-type'] == 'script'
123 127
                checkWarcinfoId (headers)
124 -
                path = g.path or '-'
125 -
                goldenpath = f':script/{urllib.parse.quote (path)}'
126 -
                assert headers['warc-target-uri'].endswith (goldenpath), (g.path, path, goldenpath)
128 +
                if g.path:
129 +
                    assert URL (headers['warc-target-uri']) == URL ('file://' + g.abspath)
130 +
                else:
131 +
                    assert 'warc-target-uri' not in headers
127 132
128 133
                data = rec.raw_stream.read ().decode ('utf-8')
129 134
                assert data == g.data
@@ -133,6 +138,7 @@
Loading
133 138
134 139
                headers = rec.rec_headers
135 140
                assert headers['warc-type'] == 'conversion'
141 +
                assert headers['x-crocoite-type'] == 'screenshot'
136 142
                checkWarcinfoId (headers)
137 143
                assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url)
138 144
                assert headers['warc-refers-to'] is None
@@ -144,10 +150,10 @@
Loading
144 150
145 151
                headers = rec.rec_headers
146 152
                assert headers['warc-type'] == 'conversion'
153 +
                assert headers['x-crocoite-type'] == 'dom-snapshot'
147 154
                checkWarcinfoId (headers)
148 155
                assert URL (headers['warc-target-uri']) == g.url
149 156
                assert headers['warc-refers-to'] is None
150 -
                assert headers['X-DOM-Snapshot'] == 'True'
151 157
152 158
                assert rec.raw_stream.read () == g.document
153 159
            elif isinstance (g, RequestResponsePair):
@@ -156,6 +162,7 @@
Loading
156 162
                # request
157 163
                headers = rec.rec_headers
158 164
                assert headers['warc-type'] == 'request'
165 +
                assert 'x-crocoite-type' not in headers
159 166
                checkWarcinfoId (headers)
160 167
                assert URL (headers['warc-target-uri']) == g.url
161 168
                assert headers['x-chrome-request-id'] == g.id
@@ -164,7 +171,6 @@
Loading
164 171
                if g.request.hasPostData:
165 172
                    if g.request.body is not None:
166 173
                        assert rec.raw_stream.read () == g.request.body
167 -
                        assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body)
168 174
                    else:
169 175
                        # body fetch failed
170 176
                        assert headers['warc-truncated'] == 'unspecified'
@@ -181,6 +187,7 @@
Loading
181 187
                    checkWarcinfoId (headers)
182 188
                    assert URL (headers['warc-target-uri']) == g.url
183 189
                    assert headers['x-chrome-request-id'] == g.id
190 +
                    assert 'x-crocoite-type' not in headers
184 191
185 192
                    # these are checked separately
186 193
                    filteredHeaders = CIMultiDict (httpheaders.headers)
@@ -197,7 +204,6 @@
Loading
197 204
198 205
                    if g.response.body is not None:
199 206
                        assert rec.raw_stream.read () == g.response.body
200 -
                        assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body))
201 207
                        assert httpheaders['content-length'] == str (len (g.response.body))
202 208
                        # body is never truncated if it exists
203 209
                        assert headers['warc-truncated'] is None

@@ -22,7 +22,7 @@
Loading
22 22
Controller classes, handling actions required for archival
23 23
"""
24 24
25 -
import time, tempfile, asyncio, json, os
25 +
import time, tempfile, asyncio, json, os, shutil
26 26
from itertools import islice
27 27
from datetime import datetime
28 28
from operator import attrgetter
@@ -355,6 +355,10 @@
Loading
355 355
    def __call__ (self, urls):
356 356
        return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls))
357 357
358 +
def hasTemplate (s):
359 +
    """ Return True if string s has string templates """
360 +
    return '{' in s and '}' in s
361 +
358 362
class RecursiveController:
359 363
    """
360 364
    Simple recursive controller
@@ -363,31 +367,36 @@
Loading
363 367
    """
364 368
365 369
    __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
366 -
            'pending', 'stats', 'prefix', 'tempdir', 'running', 'concurrency')
370 +
            'pending', 'stats', 'tempdir', 'running', 'concurrency',
371 +
            'copyLock')
367 372
368 373
    SCHEME_WHITELIST = {'http', 'https'}
369 374
370 -
    def __init__ (self, url, output, command, logger, prefix='{host}-{date}-',
375 +
    def __init__ (self, url, output, command, logger,
371 376
            tempdir=None, policy=DepthLimit (0), concurrency=1):
372 377
        self.url = url
373 378
        self.output = output
374 379
        self.command = command
375 -
        self.prefix = prefix
376 380
        self.logger = logger.bind (context=type(self).__name__, seedurl=url)
377 381
        self.policy = policy
378 382
        self.tempdir = tempdir
383 +
        # A lock if only a single output file (no template) is requested
384 +
        self.copyLock = None if hasTemplate (output) else asyncio.Lock ()
385 +
        # some sanity checks. XXX move to argparse?
386 +
        if self.copyLock and os.path.exists (self.output):
387 +
                raise ValueError ('Output file exists')
379 388
        # tasks currently running
380 389
        self.running = set ()
381 390
        # max number of tasks running
382 391
        self.concurrency = concurrency
383 392
        # keep in sync with StatsHandler
384 393
        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
385 394
386 -
    async def fetch (self, entry):
395 +
    async def fetch (self, entry, seqnum):
387 396
        """
388 397
        Fetch a single URL using an external command
389 398
390 -
        command is usually crocoite-grab
399 +
        command is usually crocoite-single
391 400
        """
392 401
393 402
        assert isinstance (entry, SetEntry)
@@ -403,8 +412,9 @@
Loading
403 412
            else:
404 413
                return e.format (url=url, dest=dest.name)
405 414
406 -
        def formatPrefix (p):
407 -
            return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
415 +
        def formatOutput (p):
416 +
            return p.format (host=url.host,
417 +
                    date=datetime.utcnow ().isoformat (), seqnum=seqnum)
408 418
409 419
        def logStats ():
410 420
            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
@@ -417,14 +427,15 @@
Loading
417 427
            return
418 428
419 429
        dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
420 -
                prefix=formatPrefix (self.prefix), suffix='.warc.gz',
421 -
                delete=False)
422 -
        destpath = os.path.join (self.output, os.path.basename (dest.name))
430 +
                prefix=__package__, suffix='.warc.gz', delete=False)
423 431
        command = list (map (formatCommand, self.command))
424 -
        logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
432 +
        logger.info ('fetch', uuid='d1288fbe-8bae-42c8-af8c-f2fa8b41794f',
433 +
                command=command)
425 434
        try:
426 -
            process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
427 -
                    stderr=asyncio.subprocess.DEVNULL, stdin=asyncio.subprocess.DEVNULL,
435 +
            process = await asyncio.create_subprocess_exec (*command,
436 +
                    stdout=asyncio.subprocess.PIPE,
437 +
                    stderr=asyncio.subprocess.DEVNULL,
438 +
                    stdin=asyncio.subprocess.DEVNULL,
428 439
                    start_new_session=True, limit=100*1024*1024)
429 440
            while True:
430 441
                data = await process.stdout.readline ()
@@ -449,8 +460,33 @@
Loading
449 460
        finally:
450 461
            code = await process.wait()
451 462
            if code == 0:
452 -
                # atomically move once finished
453 -
                os.rename (dest.name, destpath)
463 +
                if self.copyLock is None:
464 +
                    # atomically move once finished
465 +
                    lastDestpath = None
466 +
                    while True:
467 +
                        # XXX: must generate a new name every time, otherwise
468 +
                        # this loop never terminates
469 +
                        destpath = formatOutput (self.output)
470 +
                        assert destpath != lastDestpath
471 +
                        lastDestpath = destpath
472 +
473 +
                        # python does not have rename(…, …, RENAME_NOREPLACE),
474 +
                        # but this is safe nonetheless, since we’re
475 +
                        # single-threaded
476 +
                        if not os.path.exists (destpath):
477 +
                            # create the directory, so templates like
478 +
                            # /{host}/{date}/… are possible
479 +
                            os.makedirs (os.path.dirname (destpath), exist_ok=True)
480 +
                            os.rename (dest.name, destpath)
481 +
                            break
482 +
                else:
483 +
                    # atomically (in the context of this process) append to
484 +
                    # existing file
485 +
                    async with self.copyLock:
486 +
                        with open (dest.name, 'rb') as infd, \
487 +
                                open (self.output, 'ab') as outfd:
488 +
                            shutil.copyfileobj (infd, outfd)
489 +
                        os.unlink (dest.name)
454 490
            else:
455 491
                self.stats['crashed'] += 1
456 492
                logStats ()
@@ -464,6 +500,7 @@
Loading
464 500
                    have=len (self.have)-len(self.running),
465 501
                    running=len (self.running))
466 502
503 +
        seqnum = 1
467 504
        try:
468 505
            self.have = set ()
469 506
            self.pending = set ([SetEntry (self.url, depth=0)])
@@ -472,8 +509,9 @@
Loading
472 509
                # since pending is a set this picks a random item, which is fine
473 510
                u = self.pending.pop ()
474 511
                self.have.add (u)
475 -
                t = asyncio.ensure_future (self.fetch (u))
512 +
                t = asyncio.ensure_future (self.fetch (u, seqnum))
476 513
                self.running.add (t)
514 +
                seqnum += 1
477 515
478 516
                log ()
479 517

@@ -52,18 +52,24 @@
Loading
52 52
    """ A JavaScript resource """
53 53
54 54
    __slots__ = ('path', 'data')
55 +
    datadir = 'data'
55 56
56 57
    def __init__ (self, path=None, encoding='utf-8'):
57 58
        self.path = path
58 59
        if path:
59 -
            self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
60 +
            self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
60 61
61 62
    def __repr__ (self):
62 63
        return f'<Script {self.path}>'
63 64
64 65
    def __str__ (self):
65 66
        return self.data
66 67
68 +
    @property
69 +
    def abspath (self):
70 +
        return pkg_resources.resource_filename (__name__,
71 +
                os.path.join (self.datadir, self.path))
72 +
67 73
    @classmethod
68 74
    def fromStr (cls, data, path=None):
69 75
        s = Script ()
@@ -140,7 +146,7 @@
Loading
140 146
        constructor = result['objectId']
141 147
142 148
        if self.options:
143 -
            yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}/options')
149 +
            yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
144 150
        result = await tab.Runtime.callFunctionOn (
145 151
                functionDeclaration='function(options){return new this(options);}',
146 152
                objectId=constructor,
@@ -253,9 +259,12 @@
Loading
253 259
            if url in haveUrls:
254 260
                # ignore duplicate URLs. they are usually caused by
255 261
                # javascript-injected iframes (advertising) with no(?) src
256 -
                self.logger.warning (f'have DOM snapshot for URL {url}, ignoring')
262 +
                self.logger.warning ('dom snapshot duplicate',
263 +
                        uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
257 264
            elif url.scheme in ('http', 'https'):
258 -
                self.logger.debug (f'saving DOM snapshot for url {url}, base {doc["baseURL"]}')
265 +
                self.logger.debug ('dom snapshot',
266 +
                        uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
267 +
                        base=doc["baseURL"])
259 268
                haveUrls.add (url)
260 269
                walker = ChromeTreeWalker (doc)
261 270
                # remove script, to make the page static and noscript, because at the

@@ -26,6 +26,8 @@
Loading
26 26
from datetime import datetime
27 27
import hashlib, pkg_resources
28 28
29 +
from yarl import URL
30 +
29 31
class StrJsonEncoder (json.JSONEncoder):
30 32
    """ JSON encoder that turns unknown classes into a string and thus never
31 33
    fails """
@@ -39,12 +41,6 @@
Loading
39 41
        except TypeError:
40 42
            return str (obj)
41 43
42 -
def packageUrl (path):
43 -
    """
44 -
    Create URL for package data stored into WARC
45 -
    """
46 -
    return 'urn:' + __package__ + ':' + urllib.parse.quote (path)
47 -
48 44
async def getFormattedViewportMetrics (tab):
49 45
    layoutMetrics = await tab.Page.getLayoutMetrics ()
50 46
    # XXX: I’m not entirely sure which one we should use here

@@ -22,7 +22,7 @@
Loading
22 22
IRC bot “chromebot”
23 23
"""
24 24
25 -
import asyncio, argparse, json, tempfile, time, random
25 +
import asyncio, argparse, json, tempfile, time, random, os
26 26
from datetime import datetime
27 27
from urllib.parse import urlsplit
28 28
from enum import IntEnum, unique
@@ -366,7 +366,7 @@
Loading
366 366
367 367
    async def onMessage (self, nick, target, message, **kwargs):
368 368
        """ Message received """
369 -
        if target in self.channels and message.startswith (self.nick):
369 +
        if target in self.channels and message.startswith (self.nick + ':'):
370 370
            user = self.users[target].get (nick, User (nick))
371 371
            reply = ReplyContext (client=self, target=target, user=user)
372 372
@@ -500,17 +500,21 @@
Loading
500 500
                'recursive': args.recursive,
501 501
                'concurrency': args.concurrency,
502 502
                }}
503 -
        grabCmd = ['crocoite-grab']
503 +
        grabCmd = ['crocoite-single']
504 504
        grabCmd.extend (['--warcinfo',
505 505
                '!' + json.dumps (warcinfo, cls=StrJsonEncoder)])
506 506
        if args.insecure:
507 507
            grabCmd.append ('--insecure')
508 508
        grabCmd.extend (['{url}', '{dest}'])
509 509
        # prefix warcinfo with !, so it won’t get expanded
510 -
        cmdline = ['crocoite-recursive', args.url, '--tempdir', self.tempdir,
511 -
                '--prefix', j.id + '-{host}-{date}-', '--policy',
512 -
                args.recursive, '--concurrency', str (args.concurrency),
513 -
                self.destdir, '--'] + grabCmd
510 +
        cmdline = ['crocoite',
511 +
                '--tempdir', self.tempdir,
512 +
                '--recursion', args.recursive,
513 +
                '--concurrency', str (args.concurrency),
514 +
                args.url,
515 +
                os.path.join (self.destdir,
516 +
                        j.id + '-{host}-{date}-{seqnum}.warc.gz'),
517 +
                '--'] + grabCmd
514 518
515 519
        strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ()))
516 520
        reply (f'{args.url} has been queued as {j.id} with {strargs}')
@@ -627,7 +631,11 @@
Loading
627 631
        if not buf:
628 632
            return
629 633
630 -
        data = json.loads (buf)
634 +
        try:
635 +
            data = json.loads (buf)
636 +
        except json.decoder.JSONDecodeError:
637 +
            # ignore invalid
638 +
            return
631 639
        msgid = data['uuid']
632 640
633 641
        if msgid in self.ignoreMsgid:
@@ -640,9 +648,8 @@
Loading
640 648
        elif msgid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687':
641 649
            nesteddata = data['data']
642 650
            nestedmsgid = nesteddata['uuid']
643 -
            if nestedmsgid == '1680f384-744c-4b8a-815b-7346e632e8db':
651 +
            if nestedmsgid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f':
644 652
                del nesteddata['command']
645 -
                del nesteddata['destfile']
646 653
            
647 654
        buf = json.dumps (data)
648 655
        for c in self.clients:

Click to load this diff.
Loading diff...

Click to load this diff.
Loading diff...

Click to load this diff.
Loading diff...

Click to load this diff.
Loading diff...

Click to load this diff.
Loading diff...

Learn more Showing 2 files with coverage changes found.

Changes in crocoite/devtools.py
-3
+2
+1
Loading file...
Changes in crocoite/browser.py
-1
+1
Loading file...
Files Coverage
crocoite -0.71% 76.36%
Project Totals (20 files) 76.36%
Loading