-
Notifications
You must be signed in to change notification settings - Fork 14
/
vulnerability_fix_engine.py
503 lines (422 loc) · 18.8 KB
/
vulnerability_fix_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
import asyncio
import json
import logging
import os
import re
import shutil
import string
from collections import Counter
from dataclasses import dataclass, asdict, field
from random import random
from typing import List, Optional, Dict, Generator
import aiofiles
import github
import time
import github_util
# Authenticated GitHub API client, created once at import time and shared by
# the rest of this module (e.g. is_archived_git_hub_repository).
git_hub = github_util.load_github()
# Print the remaining GitHub API quota at startup so a long run can be
# monitored for rate-limit exhaustion.
github_util.print_current_rate_limit()
class ShallowUpdateNotAllowedException(Exception):
    """Raised when `git push` is rejected with 'shallow update not allowed'."""
class CouldNotReadFromRemoteRepositoryException(Exception):
    """Raised when git reports 'Could not read from remote repository'."""
class CLRFReplacementException(Exception):
    """
    Raised when git warns 'CRLF will be replaced by LF' on a staged file.

    NOTE(review): the class name preserves the original 'CLRF' misspelling of
    CRLF; renaming it would break existing `except` clauses.
    """
class PullRequestAlreadyExistsException(Exception):
    """Raised when hub reports 'A pull request already exists' for the branch."""
class ForkAlreadyExistsException(Exception):
    """Raised when `hub fork` fails because the fork already exists on github.com."""
class AmbiguousObjectNameHeadException(Exception):
    """Raised when git reports "Ambiguous object name: 'HEAD'" (a branch literally named HEAD)."""
async def subprocess_run(args: List[str], cwd: str) -> Optional[str]:
    """
    Run a subprocess asynchronously, translating well-known git/hub failure
    messages on stderr into dedicated exception types.

    :param args: The command and its arguments, argv-style.
    :param cwd: The working directory to run the command in.
    :return: The decoded stdout of the command, or None if there was none.
    :raises TimeoutError: when stderr mentions a timeout.
    :raises ShallowUpdateNotAllowedException: on 'shallow update not allowed'.
    :raises CouldNotReadFromRemoteRepositoryException: on remote-read failures.
    :raises PullRequestAlreadyExistsException: when the PR already exists.
    :raises ForkAlreadyExistsException: when the fork already exists.
    :raises AmbiguousObjectNameHeadException: when a branch is named 'HEAD'.
    :raises CLRFReplacementException: on git's CRLF->LF warning (even on success).
    :raises RuntimeError: for any other non-zero exit that produced stderr output.
    """
    proc = await asyncio.create_subprocess_exec(
        args[0],
        *args[1:],
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        cwd=cwd
    )
    stdout, stderr = await proc.communicate()
    print(f'[{args!r} exited with {proc.returncode}]')
    if stdout:
        print(f'[stdout]\n{stdout.decode()}')
    if proc.returncode != 0:
        if stderr:
            msg = stderr.decode()
            error_msg = f'[stderr]\n{msg}'
            # Map known stderr contents onto specific exception types so
            # callers can react to each failure mode individually.
            if 'timeout' in msg:
                raise TimeoutError(error_msg)
            if 'shallow update not allowed' in msg:
                raise ShallowUpdateNotAllowedException(error_msg)
            if 'Could not read from remote repository' in msg:
                raise CouldNotReadFromRemoteRepositoryException(error_msg)
            if 'A pull request already exists' in msg:
                raise PullRequestAlreadyExistsException(error_msg)
            if 'Error creating fork' in msg and 'already exists on github.com' in msg:
                raise ForkAlreadyExistsException(error_msg)
            if ' Ambiguous object name: \'HEAD\'' in msg:
                raise AmbiguousObjectNameHeadException(error_msg)
            raise RuntimeError(error_msg)
    else:
        if stderr:
            msg = stderr.decode()
            error_msg = f'[stderr]\n{msg}'
            if 'warning: CRLF will be replaced by LF' in msg:
                # BUG FIX: previously raised with the raw `stderr` bytes;
                # now uses the formatted message like every other path.
                raise CLRFReplacementException(error_msg)
            print(error_msg)
        if stdout:
            return stdout.decode()
        return None
@dataclass
class VulnerabilityFixModule:
    """
    Base configuration/strategy object for one kind of bulk vulnerability fix.

    Subclasses are expected to override `should_accept_project` and
    `do_fix_vulnerable_file` to implement the concrete fix logic.
    """
    # Name of the git branch the fixes are committed to.
    branch_name: str
    # Directory that repositories are cloned into (wiped between runs).
    clone_repos_location: str
    # Directory containing the `g__*.json` vulnerability data files.
    data_base_dir: str
    # Directory where per-project save-point files are written.
    save_point_location: str
    # Absolute path of the file holding the pull-request message.
    pr_message_file_absolute_path: str
    # Commit message used for every generated commit.
    commit_message: str
    # Lazily-populated cache backing get_vulnerable_project_files().
    _cached_vulnerable_projects: List['VulnerableProjectFiles'] = field(default_factory=list)

    def clean_previous_run(self):
        """Delete clones from a previous run and ensure required directories exist."""
        # Cleanup method to get rid of previous files
        logging.info('Beginning Cleanup')  # BUG FIX: was misspelled 'Begining'
        if os.path.isdir(self.clone_repos_location):
            shutil.rmtree(self.clone_repos_location)
        os.mkdir(self.clone_repos_location)
        if not os.path.isdir(self.save_point_location):
            os.mkdir(self.save_point_location)
        logging.info('Cleanup Complete')

    def _list_all_json_files(self) -> Generator[str, None, None]:
        """Yield the path of every `g__*.json` data file in data_base_dir."""
        # Simplified: the original encoded the directory with os.fsencode and
        # then decoded every entry again; listing the str path is equivalent.
        for filename in os.listdir(self.data_base_dir):
            if filename.startswith('g__') and filename.endswith('.json'):
                yield self.data_base_dir + '/' + filename

    def should_accept_project(self, project_name: str) -> bool:
        """Hook for subclasses: return True to process this project. Defaults to rejecting all."""
        return False

    @staticmethod
    def _read_repository_and_file_names(json_file_name: str) -> 'VulnerableProjectFiles':
        """Load one data file into a VulnerableProjectFiles record."""
        with open(json_file_name) as jsonFile:
            data = json.load(jsonFile)
        project_name: str = data['project']['name']
        # Counter is a Dict[file name, count] representation
        files = Counter([obj[0]['file'] for obj in data['data']])
        return VulnerableProjectFiles(project_name, files)

    def _load_vulnerable_projects(self) -> List['VulnerableProjectFiles']:
        """Read every data file, keeping only projects accepted by should_accept_project."""
        vulnerable_projects: List[VulnerableProjectFiles] = []
        for json_file in self._list_all_json_files():
            vulnerable = self._read_repository_and_file_names(json_file)
            if not self.should_accept_project(vulnerable.project_name):
                continue
            vulnerable.print()
            vulnerable_projects.append(vulnerable)
        return vulnerable_projects

    def get_vulnerable_project_files(self) -> List['VulnerableProjectFiles']:
        """Return the accepted vulnerable projects, loading them on first use."""
        if not self._cached_vulnerable_projects:
            self._cached_vulnerable_projects = self._load_vulnerable_projects()
        return self._cached_vulnerable_projects

    def save_point_file_name(self, project_files: 'VulnerableProjectFiles') -> str:
        """Return the save-point file path for a project ('/' in names becomes '__')."""
        project_as_file_name = project_files.project_name.replace('/', '__')
        return f'{self.save_point_location}/g__{project_as_file_name}.json'

    async def do_fix_vulnerable_file(self, project_name: str, file: str, expected_fix_count: int) -> int:
        """
        Fixes the vulnerabilities in the file passed.
        :param project_name: The name of the project being fixed.
        :param file: The file to fix the vulnerabilities in.
        :param expected_fix_count: The expected number of vulnerabilities to be fixed.
        :return: The actual number of vulnerabilities fixed.
        """
        # Abstract hook: subclasses must override; the base implementation
        # intentionally does nothing (and thus returns None).
        pass
@dataclass(frozen=True)
class VulnerabilityFixReport:
    """Immutable summary of the fixes applied to a single project."""
    # Number of distinct files that had at least one vulnerability fixed.
    files_fixed: int
    # Total number of individual vulnerabilities fixed across all files.
    vulnerabilities_fixed: int
    # Names of the fixed files; presumably repo-relative with a leading '/'
    # (do_stage_changes strips one) — TODO confirm against the data files.
    file_name_fixed: List[str]
@dataclass
class VulnerableProjectFiles:
    """A project name plus a mapping of vulnerable file name -> occurrence count."""
    project_name: str
    files: Dict[str, int]

    def print(self):
        """Dump the project name and each vulnerable file with its count to stdout."""
        print(self.project_name)
        for path, count in self.files.items():
            print('\t', f'/{path}: {count}')
@dataclass
class VulnerabilityFixerEngine:
    """
    Drives the full fix workflow for one project: clone, fix, branch, commit,
    fork, push, and pull request.

    All `do_*` coroutines shell out to `git`/`hub` through `subprocess_run`,
    so they surface the exception types that function maps stderr output to.
    """
    # The fix strategy/configuration shared by every project in this run.
    fix_module: VulnerabilityFixModule
    # The project (and its vulnerable files) this engine instance processes.
    project_files: VulnerableProjectFiles

    def _project_name(self) -> str:
        # Convenience accessor for the project's name (e.g. 'org/repo').
        return self.project_files.project_name

    def project_file_name(self) -> str:
        # Local filesystem path of this project's clone.
        return self.fix_module.clone_repos_location + '/' + self._project_name()

    def save_point_file_name(self) -> str:
        # Path of the JSON save-point file recording this project as processed.
        return self.fix_module.save_point_file_name(self.project_files)

    @staticmethod
    async def do_resilient_hub_call(args: List[str], cwd: str, lock=None) -> Optional[str]:
        """
        Make a call to hub that is resilient to timeout exceptions.
        :return: stdout output if successful
        """
        async def do_call(wait_time, previous_wait_time=0) -> Optional[str]:
            # Retries on TimeoutError, doubling the wait (plus jitter) each
            # attempt; gives up once the next wait would exceed 70 seconds.
            try:
                if lock is not None:
                    async with lock:
                        # GitHub documentation says to wait 1 second between writes
                        # https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-abuse-rate-limits
                        await asyncio.sleep(1)
                        return await subprocess_run(args, cwd=cwd)
                else:
                    return await subprocess_run(args, cwd=cwd)
            except TimeoutError as e:
                if wait_time > 70:
                    raise TimeoutError(f'Gave up after waiting {previous_wait_time} seconds') from e
                # This serves a double purpose as informational and also a 'sane'
                # way to slow down this script reasonably
                github_util.print_current_rate_limit()
                await asyncio.sleep(wait_time)
                return await do_call(wait_time * 2 + random(), previous_wait_time=wait_time)
        return await do_call(1)

    async def do_clone(self):
        """Clone the project via hub, disabling fsck so broken upstream objects don't abort the clone."""
        # Deal with fskobjects https://stackoverflow.com/a/41029655/3708426
        await self.do_resilient_hub_call(
            [
                'hub',
                'clone',
                self._project_name(),
                self._project_name(),  # This is the directory to clone into
                '--config',
                'transfer.fsckobjects=false',
                '--config',
                'receive.fsckobjects=false',
                '--config',
                'fetch.fsckobjects=false'
            ],
            cwd=self.fix_module.clone_repos_location
        )

    async def do_run_in(self, args: List[str]) -> Optional[str]:
        """Run a (non-hub) command inside this project's clone directory."""
        assert args[0] != 'hub', 'This method is unsuitable for calling `hub`. Use `do_run_hub_in` instead!'
        return await subprocess_run(args, cwd=self.project_file_name())

    async def do_run_hub_in(self, args: List[str], lock) -> Optional[str]:
        """Run a hub command inside the clone directory, with retry + API-write lock."""
        return await self.do_resilient_hub_call(args=args, cwd=self.project_file_name(), lock=lock)

    async def do_fix_vulnerable_file(self, file: str, expected_fix_count: int) -> int:
        """
        Apply the fix module to one file of this project.
        :param file: The file's path within the repo (appended to the clone path).
        :param expected_fix_count: The number of vulnerabilities expected in it.
        :return: The number of vulnerabilities actually fixed (0 if the file is gone).
        """
        file_being_fixed: str = self.project_file_name() + file
        # Sanity check, verify the file still exists, the data may be out of date
        if not os.path.exists(file_being_fixed):
            logging.warning(
                'Fix for `%s` in file `%s` can not be applied as file does not exist!',
                self._project_name(),
                file
            )
            return 0
        return await self.fix_module.do_fix_vulnerable_file(
            self._project_name(),
            file_being_fixed,
            expected_fix_count
        )

    def submodule_files(self) -> List[str]:
        """
        List all of the git submodule files in this project.
        We're not going to be fixing pom files in Git submodules so this allows us to filter them out.
        """
        files: List[str] = []
        submodule_file_path: str = self.project_file_name() + '/.gitmodules'
        if not os.path.isfile(submodule_file_path):
            return []
        with open(submodule_file_path) as submodule_file:
            for line in submodule_file:
                # NOTE(review): assumes lines look like 'path = <value>'; the
                # split on '= ' and the trailing [0:-1] (drops the newline)
                # rely on that exact format — TODO confirm for tab-indented files.
                if 'path' in line:
                    files.append('/' + line.split('= ')[1][0:-1])
        return files

    async def do_fix_vulnerabilities(self) -> VulnerabilityFixReport:
        """Fix every vulnerable file (skipping submodules) and summarize the results."""
        project_vulnerabilities_fixed = 0
        project_files_fixed = 0
        submodules = self.submodule_files()
        files_fixed: List[str] = []
        for file in self.project_files.files:
            # Skip submodule files
            skip = next((True for submodule in submodules if file.startswith(submodule)), False)
            if not skip:
                file_vulnerabilities_fixed = await self.do_fix_vulnerable_file(file, self.project_files.files[file])
                if file_vulnerabilities_fixed > 0:
                    project_vulnerabilities_fixed += file_vulnerabilities_fixed
                    project_files_fixed += 1
                    files_fixed.append(file)
        return VulnerabilityFixReport(
            project_files_fixed,
            project_vulnerabilities_fixed,
            files_fixed
        )

    async def do_create_branch(self):
        """Create and switch to the fix branch in the clone."""
        await self.do_run_in(['git', 'checkout', '-b', self.fix_module.branch_name])

    async def do_stage_changes(self, project_report: VulnerabilityFixReport):
        """`git add` only the files the report says were modified."""
        command = ['git', 'add']
        # Only run add on the files we've modified
        # This hopefully limits CRLF changes
        files_trimmed = [file_name.lstrip('/') for file_name in project_report.file_name_fixed]
        command.extend(files_trimmed)
        await self.do_run_in(command)

    async def do_commit_changes(self):
        """Commit the staged changes with the module's commit message."""
        msg = self.fix_module.commit_message
        await self.do_run_in(['git', 'commit', '-m', msg])

    async def do_fork_repository(self, lock, index: int = 0):
        """
        Fork the repo into the BulkSecurityGeneratorProject org; when the fork
        already exists there, retry with the next numbered org
        (BulkSecurityGeneratorProject1, 2, ...) up to index 46.
        """
        org_name = 'BulkSecurityGeneratorProject'
        if index == 0:
            use_org_name = org_name
        else:
            use_org_name = f'{org_name}{index}'
        try:
            await self.do_run_hub_in(
                [
                    'hub',
                    'fork',
                    '--remote-name',
                    'origin',
                    '--org',
                    use_org_name
                ],
                lock
            )
        except ForkAlreadyExistsException as e:
            # 46 presumably matches the number of pre-created orgs — TODO confirm.
            if index >= 46:
                raise e
            else:
                return await self.do_fork_repository(lock, index + 1)

    async def do_push_changes(self, retry_count: int = 5):
        """Force-push the fix branch, unshallowing or retrying on known failures."""
        try:
            # Don't use '--force-with-lease' here, it doesn't work. Trust me.
            await self.do_run_in(['git', 'push', 'origin', self.fix_module.branch_name, '--force'])
        except ShallowUpdateNotAllowedException:
            # A shallow update isn't allowed against this repo (I must have forked it before)
            await self.do_run_in(['git', 'fetch', '--unshallow'])
            # Now re-run the push
            # Don't use '--force-with-lease' here, it doesn't work. Trust me.
            await self.do_run_in(['git', 'push', 'origin', self.fix_module.branch_name, '--force'])
        except CouldNotReadFromRemoteRepositoryException as e:
            logging.warning(f'Could not read from remote repository {5 - retry_count}/5')
            if retry_count <= 0:
                raise e
            else:
                # Forking is an async operation, so we may need to wait a bit for it
                await asyncio.sleep((5 - retry_count) * 2 + random())
                await self.do_push_changes(retry_count - 1)

    async def do_create_pull_request(self, lock) -> str:
        """
        Open the pull request via hub and return its URL, or 'ALREADY_EXISTS'
        when hub reports a duplicate.
        """
        try:
            stdout = await self.do_run_hub_in(
                ['hub', 'pull-request', '-p', '--file', self.fix_module.pr_message_file_absolute_path],
                lock
            )
            pattern = re.compile(r'(https://.*)')
            # NOTE(review): if hub's output contains no https URL, `match` is
            # None and this raises AttributeError — confirm hub always prints it.
            match = pattern.search(stdout)
            return match.group(1)
        except PullRequestAlreadyExistsException:
            return 'ALREADY_EXISTS'

    async def do_create_save_point(self, report: VulnerabilityFixReport, pr_url: str):
        """Persist a JSON record of this project's processing so reruns can skip it."""
        json_body = {
            'project_name': self.project_files.project_name,
            'files': self.project_files.files,
            'pull_request': pr_url,
            'report': asdict(report)
        }
        async with aiofiles.open(self.save_point_file_name(), 'w') as json_file_to_write:
            await json_file_to_write.write(json.dumps(json_body, indent=4))
async def execute_vulnerability_fixer_engine(engine: VulnerabilityFixerEngine, lock) -> VulnerabilityFixReport:
    """
    Run the full pipeline for one project: clone, fix, and — when anything was
    fixed — branch, commit, fork, push, and open a pull request.

    :param engine: The engine bound to the project being processed.
    :param lock: Shared asyncio lock serializing GitHub API writes.
    :return: The report of what was fixed in this project.
    """
    engine.project_files.print()
    await engine.do_clone()
    project_report: VulnerabilityFixReport = await engine.do_fix_vulnerabilities()
    # Default recorded in the save point when no PR ends up being created.
    pr_url = ''
    # If the LGTM data is out-of-date, there can be cases where no vulnerabilities are fixed
    if project_report.vulnerabilities_fixed != 0:
        await engine.do_create_branch()
        await engine.do_stage_changes(project_report)
        await engine.do_commit_changes()
        # Don't fork the author's own repositories; push to them directly.
        if not engine.project_files.project_name.lower().startswith('jlleitschuh'):
            await engine.do_fork_repository(lock)
        await engine.do_push_changes()
        pr_url = await engine.do_create_pull_request(lock)
    # The save point is written even when nothing was fixed, so the project is
    # skipped on subsequent runs (see the save-point check in the fix-module loop).
    await engine.do_create_save_point(project_report, pr_url)
    return project_report
async def execute_vulnerability_fixer_engine_checked(
        engine: VulnerabilityFixerEngine,
        lock
) -> Optional[VulnerabilityFixReport]:
    """
    Run `execute_vulnerability_fixer_engine`, logging failures with the
    project name for easier debugging.

    :return: The project's fix report, or None for projects that cannot be
        fixed (e.g. their main branch is literally named 'HEAD').
    :raises BaseException: anything the engine raised, after logging it.
    """
    try:
        return await execute_vulnerability_fixer_engine(engine, lock)
    except AmbiguousObjectNameHeadException:
        # They named their main branch 'HEAD'... Why?! No fix for them.
        return None
    except asyncio.CancelledError:
        # FIX: previously detected cancellation by substring-matching
        # e.__class__.__name__, which is brittle (any class whose name merely
        # contains 'CancelledError' matched). Catch the real type and let it
        # propagate without logging, exactly as before.
        raise
    except BaseException as e:
        logging.error(
            f'Failed while processing project `{engine.project_files.project_name}`. Exception type: {type(e)}.\n{e!s}')
        raise e
def is_archived_git_hub_repository(project: VulnerableProjectFiles) -> bool:
    """
    Check whether the project's GitHub repository is archived.

    A repository that no longer exists is reported as archived too, since it
    equally cannot accept pull requests.
    """
    try:
        repository = git_hub.get_repo(project.project_name)
    except github.UnknownObjectException:
        # The repository was removed, so treat it as the same
        return True
    return repository.archived
class EnginesExecutionException(Exception):
    """Wraps any failure raised while executing a batch of fixer engines."""
async def _do_execute_engines(engines: List[VulnerabilityFixerEngine]):
    """
    Execute a batch of engines concurrently and print an aggregate summary.

    :raises EnginesExecutionException: wrapping whatever failed inside the batch.
    """
    hub_api_lock = asyncio.Lock()
    try:
        pending = [
            execute_vulnerability_fixer_engine_checked(engine, hub_api_lock)
            for engine in engines
        ]
        print(f'Processing {len(pending)} Projects:')
        reports = [r for r in await asyncio.gather(*pending) if r is not None]
        # Only projects with at least one fixed vulnerability count as fixed.
        effective = [r for r in reports if r.vulnerabilities_fixed > 0]
        projects_fixed = len(effective)
        files_fixed = sum(r.files_fixed for r in effective)
        vulnerabilities_fixed = sum(r.vulnerabilities_fixed for r in effective)
        print('Done!')
        print(f'Fixed {vulnerabilities_fixed} vulnerabilities in {files_fixed} files across {projects_fixed} projects!')
    except Exception as e:
        raise EnginesExecutionException('Engine execution failed!') from e
async def _do_execute_fix_module(fix_module: VulnerabilityFixModule, starting_letter: str):
    """
    Process every vulnerable project whose name starts with `starting_letter`.

    Projects are skipped when their save-point file already exists or when the
    GitHub repository is archived/deleted; the remaining engines are executed
    in batches of 100.
    """
    fix_module.clean_previous_run()
    vulnerable_projects = fix_module.get_vulnerable_project_files()
    print()
    print(f'Loading Async Project Executions for {len(vulnerable_projects)} Projects:')
    engines = []
    for vulnerable_project in vulnerable_projects:
        if not vulnerable_project.project_name.startswith(starting_letter):
            continue
        # Check this first, it's going to be faster
        if os.path.exists(fix_module.save_point_file_name(vulnerable_project)):
            logging.info(f'Skipping project {vulnerable_project.project_name} since save point file already exists')
            continue
        # Check this second, it's going to be slower
        if is_archived_git_hub_repository(vulnerable_project):
            logging.info(f'Skipping project {vulnerable_project.project_name} since it is archived')
            continue
        print(f'Loading Execution for: {vulnerable_project.project_name}')
        engine = VulnerabilityFixerEngine(
            fix_module=fix_module,
            project_files=vulnerable_project
        )
        engines.append(engine)
    # Break the engine list into sub-lists of size 100
    # FIX: removed the stray `x =` chained assignment that leaked a duplicate
    # name, and deleted the dead commented-out retry code that followed.
    size = 100
    engine_lists = [engines[i:i + size] for i in range(0, len(engines), size)]
    for engine_list in engine_lists:
        await _do_execute_engines(engine_list)
def do_execute_fix_module(fix_module: VulnerabilityFixModule):
    """
    Entry point: run the fix module for every starting character (letters then
    digits), timing the whole run and reporting the remaining API quota.
    """
    started_at = time.monotonic()
    for letter in string.ascii_letters + string.digits:
        asyncio.run(_do_execute_fix_module(fix_module, starting_letter=letter))
    duration_seconds = time.monotonic() - started_at
    print(f'Execution took {duration_seconds} seconds')
    github_util.print_current_rate_limit()