#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import
import shutil
import sys
import math
import datetime
import argparse
import uuid
import hashlib
import tempfile
import os
import re
import json
import stat
from git import Repo
from git import NULL_TREE
from truffleHogRegexes.regexChecks import regexes


def main():
    parser = argparse.ArgumentParser(description='Find secrets hidden in the depths of git.')
    parser.add_argument('--json', dest="output_json", action="store_true", help="Output in JSON")
    parser.add_argument("--regex", dest="do_regex", action="store_true", help="Enable high signal regex checks")
    parser.add_argument("--rules", dest="rules", help="Ignore default regexes and source from json list file")
    parser.add_argument("--entropy", dest="do_entropy", help="Enable entropy checks")
    parser.add_argument("--since_commit", dest="since_commit", help="Only scan from a given commit hash")
    parser.add_argument("--max_depth", dest="max_depth", type=int, help="The max commit depth to go back when searching for secrets")
    parser.add_argument("--branch", dest="branch", help="Name of the branch to be scanned")
    parser.add_argument('-i', '--include_paths', type=argparse.FileType('r'), metavar='INCLUDE_PATHS_FILE',
                        help='File with regular expressions (one per line), at least one of which must match a Git '
                             'object path in order for it to be scanned; lines starting with "#" are treated as '
                             'comments and are ignored. If empty or not provided (default), all Git object paths are '
                             'included unless otherwise excluded via the --exclude_paths option.')
    parser.add_argument('-x', '--exclude_paths', type=argparse.FileType('r'), metavar='EXCLUDE_PATHS_FILE',
                        help='File with regular expressions (one per line), none of which may match a Git object path '
                             'in order for it to be scanned; lines starting with "#" are treated as comments and are '
                             'ignored. If empty or not provided (default), no Git object paths are excluded unless '
                             'effectively excluded via the --include_paths option.')
    parser.add_argument("--repo_path", type=str, dest="repo_path", help="Path to the cloned repo. If provided, git_url will not be used")
    parser.add_argument("--cleanup", dest="cleanup", action="store_true", help="Clean up all temporary result files")
    parser.add_argument('git_url', type=str, help='URL for secret searching')
    parser.set_defaults(do_regex=False)
    parser.set_defaults(rules={})
    parser.set_defaults(max_depth=1000000)
    parser.set_defaults(since_commit=None)
    parser.set_defaults(do_entropy=True)
    parser.set_defaults(branch=None)
    parser.set_defaults(repo_path=None)
    parser.set_defaults(cleanup=False)
    args = parser.parse_args()
    rules = {}
    if args.rules:
        try:
            with open(args.rules, "r") as ruleFile:
                rules = json.loads(ruleFile.read())
                for rule in rules:
                    rules[rule] = re.compile(rules[rule])
        except (IOError, ValueError):
            raise Exception("Error reading rules file")
        for regex in dict(regexes):
            del regexes[regex]
        for regex in rules:
            regexes[regex] = rules[regex]
    do_entropy = str2bool(args.do_entropy)

    # read & compile path inclusion/exclusion patterns
    path_inclusions = []
    path_exclusions = []
    if args.include_paths:
        for pattern in set(l.strip() for l in args.include_paths):
            if pattern and not pattern.startswith('#'):
                path_inclusions.append(re.compile(pattern))
    if args.exclude_paths:
        for pattern in set(l.strip() for l in args.exclude_paths):
            if pattern and not pattern.startswith('#'):
                path_exclusions.append(re.compile(pattern))

    output = find_strings(args.git_url, args.since_commit, args.max_depth, args.output_json, args.do_regex, do_entropy,
                          suppress_output=False, branch=args.branch, repo_path=args.repo_path,
                          path_inclusions=path_inclusions, path_exclusions=path_exclusions)
    project_path = output["project_path"]
    if args.cleanup:
        clean_up(output)
    if output["foundIssues"]:
        sys.exit(1)
    else:
        sys.exit(0)

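# A --rules file (loaded in main above) is a JSON object mapping a reason
# label to a regex pattern, each of which is compiled with re.compile.
# A hypothetical example:
#
#     {
#         "RSA private key": "-----BEGIN RSA PRIVATE KEY-----"
#     }
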
def str2bool(v):
    if v is None:
        return True
    if isinstance(v, bool):
        # set_defaults supplies a real bool for do_entropy
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


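# Note: str2bool is only applied to --entropy; a missing value (None) maps to
# True, so entropy scanning stays enabled unless it is explicitly turned off.
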
BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
HEX_CHARS = "1234567890abcdefABCDEF"

def del_rw(action, name, exc):
    os.chmod(name, stat.S_IWRITE)
    os.remove(name)

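# del_rw above is a shutil.rmtree onerror handler: it clears the read-only bit
# and retries the delete. This is presumably for read-only files inside .git
# (notably on Windows), which rmtree cannot remove directly.
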
def shannon_entropy(data, iterator):
    """
    Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html
    """
    if not data:
        return 0
    entropy = 0
    for x in iterator:
        p_x = float(data.count(x)) / len(data)
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    return entropy


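# A quick sanity check of shannon_entropy above (doctest-style, values worked
# out by hand): a string using each hex digit exactly once is maximally mixed,
# while a repeated character carries no information.
#
#     >>> shannon_entropy("1234567890abcdef", HEX_CHARS)  # log2(16), up to float rounding
#     4.0
#     >>> shannon_entropy("aaaa", HEX_CHARS)
#     0.0
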
def get_strings_of_set(word, char_set, threshold=20):
    count = 0
    letters = ""
    strings = []
    for char in word:
        if char in char_set:
            letters += char
            count += 1
        else:
            if count > threshold:
                strings.append(letters)
            # reset the run on any character outside the set
            letters = ""
            count = 0
    if count > threshold:
        strings.append(letters)
    return strings

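# Example: only runs of char_set members longer than `threshold` survive.
#
#     >>> get_strings_of_set("deadbeef" * 3, HEX_CHARS, threshold=20)
#     ['deadbeefdeadbeefdeadbeef']
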
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def clone_git_repo(git_url):
    project_path = tempfile.mkdtemp()
    Repo.clone_from(git_url, project_path)
    return project_path

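# clone_git_repo clones into a fresh temporary directory; when find_strings
# created the clone itself (i.e. no --repo_path was given), it removes that
# directory again once scanning is done.
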
def print_results(printJson, issue):
    commit_time = issue['date']
    branch_name = issue['branch']
    prev_commit = issue['commit']
    printableDiff = issue['printDiff']
    commitHash = issue['commitHash']
    reason = issue['reason']
    path = issue['path']

    if printJson:
        print(json.dumps(issue, sort_keys=True))
    else:
        print("~~~~~~~~~~~~~~~~~~~~~")
        reason = "{}Reason: {}{}".format(bcolors.OKGREEN, reason, bcolors.ENDC)
        print(reason)
        dateStr = "{}Date: {}{}".format(bcolors.OKGREEN, commit_time, bcolors.ENDC)
        print(dateStr)
        hashStr = "{}Hash: {}{}".format(bcolors.OKGREEN, commitHash, bcolors.ENDC)
        print(hashStr)
        filePath = "{}Filepath: {}{}".format(bcolors.OKGREEN, path, bcolors.ENDC)
        print(filePath)

        if sys.version_info >= (3, 0):
            branchStr = "{}Branch: {}{}".format(bcolors.OKGREEN, branch_name, bcolors.ENDC)
            print(branchStr)
            commitStr = "{}Commit: {}{}".format(bcolors.OKGREEN, prev_commit, bcolors.ENDC)
            print(commitStr)
            print(printableDiff)
        else:
            branchStr = "{}Branch: {}{}".format(bcolors.OKGREEN, branch_name.encode('utf-8'), bcolors.ENDC)
            print(branchStr)
            commitStr = "{}Commit: {}{}".format(bcolors.OKGREEN, prev_commit.encode('utf-8'), bcolors.ENDC)
            print(commitStr)
            print(printableDiff.encode('utf-8'))
        print("~~~~~~~~~~~~~~~~~~~~~")

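# With --json, print_results emits each issue as a single json.dumps line,
# so the output can be streamed line-by-line into other tools (e.g. jq).
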
def find_entropy(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash):
    stringsFound = []
    lines = printableDiff.split("\n")
    for line in lines:
        for word in line.split():
            base64_strings = get_strings_of_set(word, BASE64_CHARS)
            hex_strings = get_strings_of_set(word, HEX_CHARS)
            for string in base64_strings:
                b64Entropy = shannon_entropy(string, BASE64_CHARS)
                if b64Entropy > 4.5:
                    stringsFound.append(string)
                    printableDiff = printableDiff.replace(string, bcolors.WARNING + string + bcolors.ENDC)
            for string in hex_strings:
                hexEntropy = shannon_entropy(string, HEX_CHARS)
                if hexEntropy > 3:
                    stringsFound.append(string)
                    printableDiff = printableDiff.replace(string, bcolors.WARNING + string + bcolors.ENDC)
    entropicDiff = None
    if len(stringsFound) > 0:
        entropicDiff = {}
        entropicDiff['date'] = commit_time
        entropicDiff['path'] = blob.b_path if blob.b_path else blob.a_path
        entropicDiff['branch'] = branch_name
        entropicDiff['commit'] = prev_commit.message
        entropicDiff['diff'] = blob.diff.decode('utf-8', errors='replace')
        entropicDiff['stringsFound'] = stringsFound
        entropicDiff['printDiff'] = printableDiff
        entropicDiff['commitHash'] = prev_commit.hexsha
        entropicDiff['reason'] = "High Entropy"
    return entropicDiff

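# The cutoffs in find_entropy above (4.5 bits for base64 runs, 3.0 for hex
# runs) are the tool's built-in heuristics. A string with n distinct symbols
# has sample entropy of at most log2(n), so a >20-character random hex run
# easily clears 3.0, while a base64 run needs roughly 23 or more distinct
# characters before it can clear 4.5.
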
def regex_check(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash, custom_regexes={}):
    if custom_regexes:
        secret_regexes = custom_regexes
    else:
        secret_regexes = regexes
    regex_matches = []
    for key in secret_regexes:
        found_strings = secret_regexes[key].findall(printableDiff)
        found_diff = printableDiff
        for found_string in found_strings:
            # highlight each match within the diff instead of replacing the whole diff
            found_diff = found_diff.replace(found_string, bcolors.WARNING + found_string + bcolors.ENDC)
        if found_strings:
            foundRegex = {}
            foundRegex['date'] = commit_time
            foundRegex['path'] = blob.b_path if blob.b_path else blob.a_path
            foundRegex['branch'] = branch_name
            foundRegex['commit'] = prev_commit.message
            foundRegex['diff'] = blob.diff.decode('utf-8', errors='replace')
            foundRegex['stringsFound'] = found_strings
            foundRegex['printDiff'] = found_diff
            foundRegex['reason'] = key
            foundRegex['commitHash'] = prev_commit.hexsha
            regex_matches.append(foundRegex)
    return regex_matches

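# regex_check defaults to the patterns shipped with the truffleHogRegexes
# package (imported at the top of this file). A non-empty custom_regexes dict
# (available via the find_strings API) replaces that set entirely; the --rules
# CLI option instead rewrites the shared regexes dict in main.
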
def diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, suppress_output, path_inclusions, path_exclusions):
    issues = []
    for blob in diff:
        printableDiff = blob.diff.decode('utf-8', errors='replace')
        if printableDiff.startswith("Binary files"):
            continue
        if not path_included(blob, path_inclusions, path_exclusions):
            continue
        commit_time = datetime.datetime.fromtimestamp(prev_commit.committed_date).strftime('%Y-%m-%d %H:%M:%S')
        foundIssues = []
        if do_entropy:
            entropicDiff = find_entropy(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash)
            if entropicDiff:
                foundIssues.append(entropicDiff)
        if do_regex:
            found_regexes = regex_check(printableDiff, commit_time, branch_name, prev_commit, blob, commitHash, custom_regexes)
            foundIssues += found_regexes
        if not suppress_output:
            for foundIssue in foundIssues:
                print_results(printJson, foundIssue)
        issues += foundIssues
    return issues

def handle_results(output, output_dir, foundIssues):
    for foundIssue in foundIssues:
        result_path = os.path.join(output_dir, str(uuid.uuid4()))
        with open(result_path, "w+") as result_file:
            result_file.write(json.dumps(foundIssue))
        output["foundIssues"].append(result_path)
    return output

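# handle_results persists each issue as its own uuid4-named JSON file under
# output_dir and records the file paths in output["foundIssues"]; clean_up
# later deletes the whole directory when --cleanup is requested.
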
def path_included(blob, include_patterns=None, exclude_patterns=None):
    """Check if the diff blob object should be included in analysis.

    If defined and non-empty, `include_patterns` has precedence over `exclude_patterns`, such that a blob that is not
    matched by any of the defined `include_patterns` will be excluded, even when it is not matched by any of the
    defined `exclude_patterns`. If either `include_patterns` or `exclude_patterns` are undefined or empty, they will
    have no effect, respectively. All blobs are included by this function when called with default arguments.

    :param blob: a Git diff blob object
    :param include_patterns: iterable of compiled regular expression objects; when non-empty, at least one pattern must
     match the blob object for it to be included; if empty or None, all blobs are included, unless excluded via
     `exclude_patterns`
    :param exclude_patterns: iterable of compiled regular expression objects; when non-empty, _none_ of the patterns
     may match the blob object for it to be included; if empty or None, no blobs are excluded if not otherwise
     excluded via `include_patterns`
    :return: False if the blob is _not_ matched by `include_patterns` (when provided) or if it is matched by
     `exclude_patterns` (when provided), otherwise returns True
    """
    path = blob.b_path if blob.b_path else blob.a_path
    if include_patterns and not any(p.match(path) for p in include_patterns):
        return False
    if exclude_patterns and any(p.match(path) for p in exclude_patterns):
        return False
    return True


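# Example (hypothetical patterns): with include_patterns=[re.compile(r'^src/')],
# a blob at "docs/README.md" is skipped while "src/app.py" is scanned, unless
# an exclude pattern also matches "src/app.py".
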
def find_strings(git_url, since_commit=None, max_depth=1000000, printJson=False, do_regex=False, do_entropy=True, suppress_output=True,
                 custom_regexes={}, branch=None, repo_path=None, path_inclusions=None, path_exclusions=None):
    output = {"foundIssues": []}
    if repo_path:
        project_path = repo_path
    else:
        project_path = clone_git_repo(git_url)
    repo = Repo(project_path)
    already_searched = set()
    output_dir = tempfile.mkdtemp()

    if branch:
        branches = repo.remotes.origin.fetch(branch)
    else:
        branches = repo.remotes.origin.fetch()

    for remote_branch in branches:
        since_commit_reached = False
        branch_name = remote_branch.name
        prev_commit = None
        for curr_commit in repo.iter_commits(branch_name, max_count=max_depth):
            commitHash = curr_commit.hexsha
            if commitHash == since_commit:
                since_commit_reached = True
            if since_commit and since_commit_reached:
                prev_commit = curr_commit
                continue
            # if not prev_commit, then curr_commit is the newest commit. And we have nothing to diff with.
            # But we will diff the first commit with NULL_TREE here to check the oldest code.
            # In this way, no commit will be missed.
            diff_hash = hashlib.md5((str(prev_commit) + str(curr_commit)).encode('utf-8')).digest()
            if not prev_commit:
                prev_commit = curr_commit
                continue
            elif diff_hash in already_searched:
                prev_commit = curr_commit
                continue
            else:
                diff = prev_commit.diff(curr_commit, create_patch=True)
            # avoid searching the same diffs
            already_searched.add(diff_hash)
            foundIssues = diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, suppress_output, path_inclusions, path_exclusions)
            output = handle_results(output, output_dir, foundIssues)
            prev_commit = curr_commit
        # Handling the first commit
        diff = curr_commit.diff(NULL_TREE, create_patch=True)
        foundIssues = diff_worker(diff, curr_commit, prev_commit, branch_name, commitHash, custom_regexes, do_entropy, do_regex, printJson, suppress_output, path_inclusions, path_exclusions)
        output = handle_results(output, output_dir, foundIssues)
    output["project_path"] = project_path
    output["clone_uri"] = git_url
    output["issues_path"] = output_dir
    if not repo_path:
        shutil.rmtree(project_path, onerror=del_rw)
    return output

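# find_strings returns a dict shaped like (illustrative):
#
#     {"foundIssues": [<paths to per-issue JSON files>],
#      "project_path": <scanned working tree>,
#      "clone_uri": <the git_url argument>,
#      "issues_path": <temp dir holding the per-issue files>}
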
def clean_up(output):
    issues_path = output.get("issues_path", None)
    if issues_path and os.path.isdir(issues_path):
        shutil.rmtree(output["issues_path"])

if __name__ == "__main__":
    main()