1 6
import re
2 6
from urllib.parse import urlparse
3

4 6
# Placeholder written in place of every credential removed from text.
REPLACE_STR = '$encrypted$'


class UriCleaner(object):
    """Redact credentials embedded in URIs that appear inside free-form text."""

    REPLACE_STR = REPLACE_STR
    # https://regex101.com/r/sV2dO2/2
    SENSITIVE_URI_PATTERN = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s('
                                       r')<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s('
                                       r')<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))',
                                       re.MULTILINE)

    @staticmethod
    def _redact_uri(uri_str):
        """Return ``uri_str`` with its credentials replaced by ``REPLACE_STR``.

        Only the authority section (``user:password@host``) is rewritten, so
        a short username or password can never clobber the scheme, the path
        or a placeholder that was just inserted.  A URI that cannot be parsed,
        or whose authority is ambiguous, is replaced entirely so nothing
        sensitive can slip through.  A URI without credentials is returned
        unchanged.
        """
        placeholder = UriCleaner.REPLACE_STR
        try:
            # ``urlparse`` is the urllib.parse *function*; it raises
            # ValueError for malformed authorities such as a lone '['.
            netloc = urlparse(uri_str).netloc
        except ValueError:
            return placeholder
        if not netloc:
            return uri_str

        userinfo, at_sign, host = netloc.rpartition('@')
        username, colon, password = userinfo.partition(':')
        if username or password:
            redacted_netloc = ''.join((
                placeholder if username else '',
                colon,
                placeholder if password else '',
                at_sign,
                host,
            ))
        elif ':' in netloc:
            # Special case: "http://username:password" with no '@host' part.
            # Note this also hides a plain "host:port" authority.
            pieces = netloc.split(':')
            if len(pieces) != 2:
                # More than one colon (e.g. an IPv6 literal): the two halves
                # cannot be told apart, so hide the whole URI.
                return placeholder
            redacted_netloc = ':'.join(
                placeholder if piece else piece for piece in pieces
            )
        else:
            return uri_str

        # The authority starts right after the first '//' of the URI.
        netloc_start = uri_str.find('//') + 2
        netloc_end = netloc_start + len(netloc)
        if uri_str[netloc_start:netloc_end] != netloc:
            # The parser normalised something unexpectedly; stay on the safe side.
            return placeholder
        return uri_str[:netloc_start] + redacted_netloc + uri_str[netloc_end:]

    @staticmethod
    def remove_sensitive(cleartext):
        """Return ``cleartext`` with the credentials of every URI redacted.

        Each URI matched by ``SENSITIVE_URI_PATTERN`` has its username and
        password replaced by ``REPLACE_STR``; all other text is returned
        untouched.  URIs that cannot be parsed are replaced entirely.
        """
        redactedtext = cleartext
        text_index = 0
        while True:
            match = UriCleaner.SENSITIVE_URI_PATTERN.search(redactedtext, text_index)
            if not match:
                break
            start, end = match.span(1)
            uri_str = match.group(1)
            redacted_uri = UriCleaner._redact_uri(uri_str)
            if redacted_uri != uri_str:
                redactedtext = redactedtext[:start] + redacted_uri + redactedtext[end:]
            # Resume right after the (possibly rewritten) URI so that no part
            # of it is scanned a second time.
            text_index = start + len(redacted_uri)

        return redactedtext
56

57

58 6
class PlainTextCleaner(object):
    """Redact a known secret value wherever it occurs in plain text."""

    REPLACE_STR = REPLACE_STR

    @staticmethod
    def remove_sensitive(cleartext, sensitive):
        """Return ``cleartext`` with each occurrence of ``sensitive`` redacted.

        ``sensitive`` is matched literally and every occurrence is replaced
        by ``REPLACE_STR``.  When ``sensitive`` is empty or ``None`` there is
        nothing to hide and ``cleartext`` is returned unchanged.
        """
        if not sensitive:
            return cleartext
        # A literal replace is equivalent to substituting an escaped regex,
        # and never interprets backslashes in the placeholder.
        return cleartext.replace(sensitive, PlainTextCleaner.REPLACE_STR)

Read our documentation on viewing source code.

Loading