1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3

4 1
from __future__ import absolute_import
5 1
import shutil
6 1
import sys
7 1
import math
8 1
import datetime
9 1
import argparse
10 1
import uuid
11 1
import hashlib
12 1
import tempfile
13 1
import os
14 1
import re
15 1
import json
16 1
import stat
17 1
from git import Repo
18 1
from git import NULL_TREE
19 1
from truffleHogRegexes.regexChecks import regexes
20

21

22

23 1
def main():
    """Command-line entry point: parse options, then scan the target repo.

    Exits with status 1 when any secret was found (so CI pipelines can fail),
    0 otherwise.
    """
    parser = argparse.ArgumentParser(description='Find secrets hidden in the depths of git.')
    parser.add_argument('--json', dest="output_json", action="store_true", help="Output in JSON")
    parser.add_argument("--regex", dest="do_regex", action="store_true", help="Enable high signal regex checks")
    parser.add_argument("--rules", dest="rules", help="Ignore default regexes and source from json list file")
    parser.add_argument("--entropy", dest="do_entropy", help="Enable entropy checks")
    parser.add_argument("--since_commit", dest="since_commit", help="Only scan from a given commit hash")
    parser.add_argument("--max_depth", dest="max_depth", help="The max commit depth to go back when searching for secrets")
    parser.add_argument("--branch", dest="branch", help="Name of the branch to be scanned")
    parser.add_argument('-i', '--include_paths', type=argparse.FileType('r'), metavar='INCLUDE_PATHS_FILE',
                        help='File with regular expressions (one per line), at least one of which must match a Git '
                             'object path in order for it to be scanned; lines starting with "#" are treated as '
                             'comments and are ignored. If empty or not provided (default), all Git object paths are '
                             'included unless otherwise excluded via the --exclude_paths option.')
    parser.add_argument('-x', '--exclude_paths', type=argparse.FileType('r'), metavar='EXCLUDE_PATHS_FILE',
                        help='File with regular expressions (one per line), none of which may match a Git object path '
                             'in order for it to be scanned; lines starting with "#" are treated as comments and are '
                             'ignored. If empty or not provided (default), no Git object paths are excluded unless '
                             'effectively excluded via the --include_paths option.')
    parser.add_argument("--repo_path", type=str, dest="repo_path", help="Path to the cloned repo. If provided, git_url will not be used")
    parser.add_argument("--cleanup", dest="cleanup", action="store_true", help="Clean up all temporary result files")
    parser.add_argument('git_url', type=str, help='URL for secret searching')
    # NOTE(review): 'regex' and 'entropy' below do not match any dest declared
    # above ('do_regex'/'do_entropy'), so these two defaults set attributes
    # nothing reads; kept as-is to avoid any behavior change.
    parser.set_defaults(regex=False)
    parser.set_defaults(rules={})
    parser.set_defaults(max_depth=1000000)
    parser.set_defaults(since_commit=None)
    parser.set_defaults(entropy=True)
    parser.set_defaults(branch=None)
    parser.set_defaults(repo_path=None)
    parser.set_defaults(cleanup=False)
    args = parser.parse_args()

    rules = {}
    if args.rules:
        try:
            with open(args.rules, "r") as ruleFile:
                rules = json.loads(ruleFile.read())
                for rule in rules:
                    rules[rule] = re.compile(rules[rule])
        except (IOError, ValueError):
            # Bug fix: the old code did `raise("Error reading rules file")`,
            # which raises a TypeError (strings are not exceptions) instead of
            # surfacing the intended message.
            raise Exception("Error reading rules file")
        # Swap the default rule set for the user-supplied one *in place*, so
        # other code referencing the shared `regexes` dict sees the new rules.
        for regex in dict(regexes):
            del regexes[regex]
        for regex in rules:
            regexes[regex] = rules[regex]
    do_entropy = str2bool(args.do_entropy)

    # read & compile path inclusion/exclusion patterns
    path_inclusions = []
    path_exclusions = []
    if args.include_paths:
        for pattern in set(l[:-1].lstrip() for l in args.include_paths):
            if pattern and not pattern.startswith('#'):
                path_inclusions.append(re.compile(pattern))
    if args.exclude_paths:
        for pattern in set(l[:-1].lstrip() for l in args.exclude_paths):
            if pattern and not pattern.startswith('#'):
                path_exclusions.append(re.compile(pattern))

    output = find_strings(args.git_url, args.since_commit, args.max_depth, args.output_json, args.do_regex, do_entropy,
            surpress_output=False, branch=args.branch, repo_path=args.repo_path, path_inclusions=path_inclusions, path_exclusions=path_exclusions)
    if args.cleanup:
        clean_up(output)
    # Non-zero exit signals "secrets found" to calling scripts/CI.
    if output["foundIssues"]:
        sys.exit(1)
    else:
        sys.exit(0)
90

91 1
def str2bool(v):
    """Interpret a CLI flag value as a boolean.

    None (flag omitted) defaults to True; otherwise common yes/no spellings
    are accepted case-insensitively.

    :param v: string value from the command line, or None
    :return: True or False
    :raises argparse.ArgumentTypeError: for unrecognized values
    """
    # Identity check is the correct way to test for None (was `v == None`).
    if v is None:
        return True
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
100

101

102 1
# Alphabets used by the entropy scanner to extract candidate secret strings
# from diff text (see get_strings_of_set / find_entropy).
BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
HEX_CHARS = "1234567890abcdefABCDEF"
104

105 1
def del_rw(action, name, exc):
    """`shutil.rmtree` onerror handler: make the path writable, then delete it.

    Presumably here to cope with read-only files (e.g. git object files on
    Windows) that make a plain rmtree fail. The *action* and *exc* arguments
    are required by the onerror callback protocol but are not used.
    """
    os.chmod(name, stat.S_IWRITE)
    os.remove(name)
108

109 1
def shannon_entropy(data, iterator):
    """Return the Shannon entropy (bits/symbol) of *data* over the alphabet
    given by *iterator*; symbols outside the alphabet contribute nothing.

    Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html
    """
    if not data:
        return 0
    length = float(len(data))
    entropy = 0
    for symbol in iterator:
        frequency = data.count(symbol) / length
        if frequency > 0:
            entropy -= frequency * math.log(frequency, 2)
    return entropy
121

122

123 1
def get_strings_of_set(word, char_set, threshold=20):
    """Collect maximal runs of characters from *char_set* within *word*.

    Only runs strictly longer than *threshold* characters are returned.
    """
    strings = []
    current = ""
    for ch in word:
        if ch in char_set:
            current += ch
        else:
            if len(current) > threshold:
                strings.append(current)
            current = ""
    # Flush a trailing run that reaches the end of the word.
    if len(current) > threshold:
        strings.append(current)
    return strings
139

140 1
class bcolors:
    """ANSI terminal escape sequences used to colorize console output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
149

150 1
def clone_git_repo(git_url):
    """Clone *git_url* into a fresh temporary directory and return its path.

    The caller is responsible for removing the directory when done (see the
    rmtree at the end of find_strings).
    """
    tmp_dir = tempfile.mkdtemp()
    Repo.clone_from(git_url, tmp_dir)
    return tmp_dir
154

155 1
def print_results(printJson, issue):
    """Print a single issue dict, either as one JSON line or as colorized,
    human-readable text framed by tilde rules."""
    commit_time = issue['date']
    branch_name = issue['branch']
    prev_commit = issue['commit']
    printableDiff = issue['printDiff']
    commitHash = issue['commitHash']
    reason = issue['reason']
    path = issue['path']

    if printJson:
        print(json.dumps(issue, sort_keys=True))
        return

    print("~~~~~~~~~~~~~~~~~~~~~")
    header_fields = (
        ("Reason", reason),
        ("Date", commit_time),
        ("Hash", commitHash),
        ("Filepath", path),
    )
    for label, value in header_fields:
        print("{}{}: {}{}".format(bcolors.OKGREEN, label, value, bcolors.ENDC))

    # Python 2 needs an explicit UTF-8 encode before printing these values.
    if sys.version_info >= (3, 0):
        branch_out = branch_name
        commit_out = prev_commit
        diff_out = printableDiff
    else:
        branch_out = branch_name.encode('utf-8')
        commit_out = prev_commit.encode('utf-8')
        diff_out = printableDiff.encode('utf-8')
    print("{}Branch: {}{}".format(bcolors.OKGREEN, branch_out, bcolors.ENDC))
    print("{}Commit: {}{}".format(bcolors.OKGREEN, commit_out, bcolors.ENDC))
    print(diff_out)
    print("~~~~~~~~~~~~~~~~~~~~~")
190

191 1
def find_entropy(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash):
    """Scan a printable diff for high-entropy base64 and hex strings.

    Each whitespace-separated word is mined for long runs of base64/hex
    characters; runs whose Shannon entropy exceeds the per-alphabet threshold
    are flagged and highlighted inside the diff text.

    :return: an issue dict when at least one suspicious string is found,
     otherwise None. Note the recorded hash comes from prev_commit.hexsha,
     not the commitHash parameter (kept for signature parity).
    """
    stringsFound = []
    for line in printableDiff.split("\n"):
        for word in line.split():
            # Base64 candidates are checked first, then hex — both alphabets
            # are mined from the original (unhighlighted) word.
            for candidate in get_strings_of_set(word, BASE64_CHARS):
                if shannon_entropy(candidate, BASE64_CHARS) > 4.5:
                    stringsFound.append(candidate)
                    printableDiff = printableDiff.replace(candidate, bcolors.WARNING + candidate + bcolors.ENDC)
            for candidate in get_strings_of_set(word, HEX_CHARS):
                if shannon_entropy(candidate, HEX_CHARS) > 3:
                    stringsFound.append(candidate)
                    printableDiff = printableDiff.replace(candidate, bcolors.WARNING + candidate + bcolors.ENDC)
    if not stringsFound:
        return None
    return {
        'date': commit_time,
        'path': blob.b_path if blob.b_path else blob.a_path,
        'branch': branch_name,
        'commit': prev_commit.message,
        'diff': blob.diff.decode('utf-8', errors='replace'),
        'stringsFound': stringsFound,
        'printDiff': printableDiff,
        'commitHash': prev_commit.hexsha,
        'reason': "High Entropy",
    }
221

222 1
def regex_check(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash, custom_regexes={}):
    """Run the regex rules against a printable diff.

    :param custom_regexes: optional {name: compiled_regex} mapping; when
     non-empty it is used instead of the shared module-level `regexes`.
     (NOTE: mutable default kept for interface compatibility; it is only read.)
    :return: list of issue dicts, one per rule that matched at least once.
    """
    if custom_regexes:
        secret_regexes = custom_regexes
    else:
        secret_regexes = regexes
    regex_matches = []
    for key in secret_regexes:
        found_strings = secret_regexes[key].findall(printableDiff)
        if not found_strings:
            continue
        # Highlight every match *within* the diff. Bug fix: the old code did
        # printableDiff.replace(printableDiff, ...), which replaced the entire
        # diff with just the matched string (and kept only the last match).
        found_diff = printableDiff
        for found_string in found_strings:
            found_diff = found_diff.replace(found_string, bcolors.WARNING + found_string + bcolors.ENDC)
        foundRegex = {}
        foundRegex['date'] = commit_time
        foundRegex['path'] = blob.b_path if blob.b_path else blob.a_path
        foundRegex['branch'] = branch_name
        foundRegex['commit'] = prev_commit.message
        foundRegex['diff'] = blob.diff.decode('utf-8', errors='replace')
        foundRegex['stringsFound'] = found_strings
        foundRegex['printDiff'] = found_diff
        foundRegex['reason'] = key
        foundRegex['commitHash'] = prev_commit.hexsha
        regex_matches.append(foundRegex)
    return regex_matches
245

246 1
def diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, surpress_output, path_inclusions, path_exclusions):
    """Run the configured checks over every blob in one commit diff and
    return the list of issue dicts found (printing each one unless
    surpress_output is set)."""
    issues = []
    for blob in diff:
        printableDiff = blob.diff.decode('utf-8', errors='replace')
        # git emits a "Binary files ... differ" stub instead of a text diff.
        if printableDiff.startswith("Binary files"):
            continue
        if not path_included(blob, path_inclusions, path_exclusions):
            continue
        commit_time = datetime.datetime.fromtimestamp(prev_commit.committed_date).strftime('%Y-%m-%d %H:%M:%S')
        blob_issues = []
        if do_entropy:
            entropy_issue = find_entropy(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash)
            if entropy_issue:
                blob_issues.append(entropy_issue)
        if do_regex:
            blob_issues.extend(regex_check(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash, custom_regexes))
        if not surpress_output:
            for found in blob_issues:
                print_results(printJson, found)
        issues.extend(blob_issues)
    return issues
268

269 1
def handle_results(output, output_dir, foundIssues):
    """Persist each issue as a JSON file (random UUID name) under
    *output_dir* and record the file path in output["foundIssues"].

    :return: the same *output* dict, mutated in place.
    """
    for issue in foundIssues:
        issue_file = os.path.join(output_dir, str(uuid.uuid4()))
        with open(issue_file, "w+") as handle:
            handle.write(json.dumps(issue))
        output["foundIssues"].append(issue_file)
    return output
276

277 1
def path_included(blob, include_patterns=None, exclude_patterns=None):
    """Decide whether a diff blob's path should be scanned.

    A non-empty *include_patterns* takes precedence: the path must match at
    least one of them. A non-empty *exclude_patterns* then rejects any path
    matching one of them. With both unset (the defaults), every blob is
    included.

    :param blob: a Git diff blob object (b_path preferred over a_path)
    :param include_patterns: iterable of compiled regexes, or None/empty
    :param exclude_patterns: iterable of compiled regexes, or None/empty
    :return: True when the blob should be analyzed, False otherwise
    """
    path = blob.b_path if blob.b_path else blob.a_path
    if include_patterns:
        if not any(pattern.match(path) for pattern in include_patterns):
            return False
    if exclude_patterns:
        if any(pattern.match(path) for pattern in exclude_patterns):
            return False
    return True
301

302

303 1
def find_strings(git_url, since_commit=None, max_depth=1000000, printJson=False, do_regex=False, do_entropy=True, surpress_output=True,
                custom_regexes={}, branch=None, repo_path=None, path_inclusions=None, path_exclusions=None):
    """Scan a git repository for secrets and return a summary dict.

    Fetches every remote branch (or only *branch* when given), walks up to
    *max_depth* commits per branch, diffs consecutive commits, and hands each
    diff to diff_worker for entropy and/or regex checks. Issues are written
    to a temp dir by handle_results; the returned dict carries "foundIssues"
    (result file paths), "project_path", "clone_uri" and "issues_path".

    NOTE(review): custom_regexes={} is a mutable default argument; it is only
    passed through unmodified here, so it is harmless, but a None sentinel
    would be safer.
    """
    output = {"foundIssues": []}
    # Reuse an existing checkout when given, otherwise clone into a temp dir.
    if repo_path:
        project_path = repo_path
    else:
        project_path = clone_git_repo(git_url)
    repo = Repo(project_path)
    already_searched = set()
    # Per-issue JSON result files are written here (see handle_results).
    output_dir = tempfile.mkdtemp()

    if branch:
        branches = repo.remotes.origin.fetch(branch)
    else:
        branches = repo.remotes.origin.fetch()

    for remote_branch in branches:
        since_commit_reached = False
        branch_name = remote_branch.name
        prev_commit = None
        # NOTE(review): if a branch yields zero commits, curr_commit is never
        # bound and the NULL_TREE diff below raises NameError — confirm
        # whether empty branches can occur in practice.
        for curr_commit in repo.iter_commits(branch_name, max_count=max_depth):
            commitHash = curr_commit.hexsha
            if commitHash == since_commit:
                since_commit_reached = True
            # Skip commits at and before since_commit, when one was given.
            if since_commit and since_commit_reached:
                prev_commit = curr_commit
                continue
            # if not prev_commit, then curr_commit is the newest commit. And we have nothing to diff with.
            # But we will diff the first commit with NULL_TREE here to check the oldest code.
            # In this way, no commit will be missed.
            diff_hash = hashlib.md5((str(prev_commit) + str(curr_commit)).encode('utf-8')).digest()
            if not prev_commit:
                prev_commit = curr_commit
                continue
            elif diff_hash in already_searched:
                prev_commit = curr_commit
                continue
            else:
                diff = prev_commit.diff(curr_commit, create_patch=True)
            # avoid searching the same diffs
            already_searched.add(diff_hash)
            foundIssues = diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, surpress_output, path_inclusions, path_exclusions)
            output = handle_results(output, output_dir, foundIssues)
            prev_commit = curr_commit
        # Handling the first commit
        diff = curr_commit.diff(NULL_TREE, create_patch=True)
        foundIssues = diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, surpress_output, path_inclusions, path_exclusions)
        output = handle_results(output, output_dir, foundIssues)
    output["project_path"] = project_path
    output["clone_uri"] = git_url
    output["issues_path"] = output_dir
    # Only delete the checkout when we created it ourselves above.
    if not repo_path:
        shutil.rmtree(project_path, onerror=del_rw)
    return output
357

358 1
def clean_up(output):
    """Remove the temporary issues directory recorded in *output*, if any.

    Safe to call when the "issues_path" key is absent or the directory is
    already gone. (Removed a stray debug print left over from development.)
    """
    issues_path = output.get("issues_path", None)
    if issues_path and os.path.isdir(issues_path):
        shutil.rmtree(issues_path)
363

364 1
# Script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()

Read our documentation on viewing source code .

Loading