You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
trezor-firmware/crypto/fuzzer/extract_fuzzer_dictionary.py

400 lines
14 KiB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This experimental program is designed to extract a subset of interesting test
case snippets from the trezor-crypto test directory and output them as a
standard fuzzer dictionary file.
The program is built on quick-and-dirty regex matching that is known to be
incorrect for parsing code files, but is considered "good enough" for this
specific purpose.
Note that there are target-specific configurations and internal filter settings.
"""
import argparse
import binascii
import glob
import re
# re2 is considered for future use
# it requires a system installation and the google-re2 python package
# import re2
# Expected target format for strings in code:
# Most strings are defined in the general form "example"
# There are a few test vectors in crypto/tests/wycheproof/javascript/EcUtil.js
# with 'example' style string definitions, these are ignored for now
# Directory that is searched for test files; it is a relative path.
TARGET_DIR = "../tests"

# File types that currently do not provide enough value are left out on
# purpose: *.js, *.md, *.sh, *.html and others from the wycheproof subdirectory.
targeted_filetypes_multiline_classA = ("*.c", "*.h", "*.py")
# Multiline strings in Java files look different and are handled separately.
targeted_filetypes_multiline_classB = ("*.java",)
# Union of both classes, class A patterns first.
targeted_filetypes_multiline = (
    *targeted_filetypes_multiline_classA,
    *targeted_filetypes_multiline_classB,
)

# Files without multiline string content.
# Note: consider switching to actual JSON parsing?
# Note: the wycheproof repository also ships test cases for other cryptography
# such as DSA and RSA; they are currently less interesting for the fuzzer
# dictionary and therefore excluded.
targeted_filetypes_singleline = (
    "aes*.json",
    "ecdh*.json",
    "ecdsa*.json",
    "x25519*.json",
    "chacha20*.json",
    "kw*.json",
)

# Module-wide switch read by print_verbose(); set from the command line.
verbose = False
# patterns to extract
# singleline:
# "4a1e76f133afb"
# 0xAF8BBDFE8CDD5 and 0x0488b21e
# m/0'/2147483647'/1'/2147483646'/2' in test_check.c via m/[\d'/]+
#
# multiline:
# "fffc" \n "99"
# "dpubZ9169K" \n "bTYbcY"
# "\x65\xf9" \\n "\xa0\x6a"
# { 0x086d8bd5, 0x1018f82f, \n 0xc55ece} , see rg "0x([a-zA-Z0-9])+"
# patterns to ignore
# lines with print statements
# lines with exceptions
# comments and other metadata in the testvector JSON files
# filenames
# import statements and other package names
# patterns to investigate further
# public keys with the form BEGIN PUBLIC KEY
# TODO "abc" + "def" string concatenation on the same line without newline
# strings in comments
# dictionary text export file format
# general description:
# https://github.com/AFLplusplus/AFLplusplus/blob/stable/dictionaries/README.md
#
# the exported file is primarily designed for use with a recent libFuzzer version
# and is known to be partially incompatible with other fuzzers that impose
# other limitations
#
# known incompatibilities:
# * honggfuzz only reads a limited number of dictionary entries (8192 with version 2.5)
# * afl++ only reads line content with up to 128 bytes
# Quoted string with at least one inner character and no inner quote character.
regex_string_general_definition = r"\"[^\"]+\""
regex_string_general = re.compile(regex_string_general_definition)

# The capturing group drops prefix and suffix outside of the quotes.
# Note that this is prone to matching the last line of a C-style multiline
# string, which is addressed via extra state handling during file processing.
regex_oneline_string = re.compile(rf"({regex_string_general_definition})\s*[\,\)]+")

# Lines where a "+" character precedes a string are ignored.
regex_oneline_string_java_ignore1 = re.compile(r"^\s*\+\s*\"")

# Hex digits only; once anchored to a complete quoted input, once unanchored.
regex_hex_character_segment_inner_definition = "[0-9a-fA-F]+"
regex_hex_character_input_complete = re.compile(
    f'^"{regex_hex_character_segment_inner_definition}"$'
)
regex_hex_character_input_inner = re.compile(
    regex_hex_character_segment_inner_definition
)

# Most constants are preceded by a space, but some have a "(" "[" or "{"
# in front of them.
regex_hex_constant_singleline = re.compile(r"(?<=\(|\[|\{| )0x[a-fA-F0-9]+")

# C-style multiline strings: quoted segments separated only by a line break,
# plus the pattern for the quote/newline/quote seam between two segments.
regex_c_style_multiline = re.compile(
    r"(?:\".+\"\s*\n\s*)+(?:\".+\")", flags=re.MULTILINE
)
regex_c_intermediary_content = re.compile(r"\"\s*\n\s*\"", flags=re.MULTILINE)

# TODO how to prevent matching in the middle of a multi-line string concatenation?
# negative lookbehind for "+" is not possible generically and
# (?<!\+ ) and similar patterns are too static
regex_java_style_multiline = re.compile(
    r"(?:\".+\"\s*\n\s*\+\s*)+(?:\".+\")", flags=re.MULTILINE
)
regex_java_intermediary_content = re.compile(
    r"\"\s*\n\s*\+\s*\"", flags=re.MULTILINE
)

# A literal backslash followed by "n", i.e. a textual newline escape.
regex_text_newline = re.compile(r"\\n")

# Primitive regex that catches most filenames in the data set.
regex_filename_heuristic = re.compile(r"\.[a-zA-Z]+")
# Statistics counters, updated by detect_and_convert_hex().
counter_hex_content = 0
counter_wycheproof_hex_reconstruction = 0

# JSON lines are only considered when they contain one of these quoted keys.
# TODO add '"curve"' to capture algorithm names?
allowlist_keywords_json = (
    '"uncompressed"',
    '"wx"',
    '"wy"',
    '"msg"',
    '"sig"',
    '"key"',
    '"iv"',
    '"ct"',
    '"aad"',
    '"tag"',
    '"public"',
    '"private"',
    '"shared"',
    '"padding"',
    '"x"',
    '"d"',
)

# Lines containing any of these substrings are skipped.
# TODO the "keyPem" entry is only a workaround for an encoding issue
ignore_keywords_java = (
    "println(",
    "Exception(",
    '"keyPem"',
)
ignore_keywords_c = ("printf(",)
def ignore_single_line_json(data):
    """
    Decide whether a JSON line should be skipped.

    Returns False as soon as any entry of allowlist_keywords_json occurs in
    the line, and True when none of them does.
    """
    # everything that is not matched by the allowlist is ignored
    return not any(keyword in data for keyword in allowlist_keywords_json)
def ignore_single_line_java(data):
    """
    Decide whether a line should be skipped based on ignore_keywords_java.

    Returns True when the line contains at least one of those substrings,
    False otherwise.
    """
    return any(keyword in data for keyword in ignore_keywords_java)
def ignore_single_line_c(data):
    """
    Decide whether a line should be skipped based on ignore_keywords_c.

    Returns True when the line contains at least one of those substrings,
    False otherwise.
    """
    return any(keyword in data for keyword in ignore_keywords_c)
def ignore_general(data):
    """
    Decide whether a candidate entry should be dropped.

    Returns True when regex_filename_heuristic matches anywhere in the input,
    False otherwise.
    """
    looks_like_filename = regex_filename_heuristic.search(data) is not None
    return looks_like_filename
def encode_strings_for_dictionary(data):
    """
    Handle dictionary-specific encoding steps.

    Assumes that inputs are already in string quotes.

    Every textual newline escape (a backslash followed by "n") is replaced
    with the hex escape for the newline byte. Escape sequences are consumed
    from left to right, so an escaped backslash that happens to be followed
    by the letter "n" is left untouched instead of being mistaken for a
    newline escape.

    Returns the encoded string; all other content is passed through as is.
    """

    def _encode_escape(match):
        # only the newline escape is rewritten, any other two-character
        # escape sequence (including an escaped backslash) is kept verbatim
        if match.group(1) == "n":
            return "\\x0a"
        return match.group(0)

    # libfuzzer does not like "\n" string patterns in dictionary files, replace
    # it with an encoded newline
    # matching "backslash + any character" pairs up escaped backslashes first,
    # which prevents a match that starts in the middle of such a pair
    return re.sub(r"\\(.)", _encode_escape, data)
def detect_and_convert_hex(data):
    """
    Convert quoted hex strings into quoted sequences of "\\x.." byte escapes.

    Input that is not a complete quoted hex string is passed through
    unchanged. Updates the module-level counters counter_hex_content and
    counter_wycheproof_hex_reconstruction.
    """
    global counter_hex_content
    global counter_wycheproof_hex_reconstruction

    # anything that is not exclusively hex digits in quotes is not touched
    if not regex_hex_character_input_complete.search(data):
        return data

    hex_digits = regex_hex_character_input_inner.search(data).group(0)

    if len(hex_digits) % 2 == 1:
        # Note: the test cases in the wycheproof testvector JSON files have
        # a custom binary hex format to represent keys
        # among other things, this results in hex strings with an uneven
        # number of characters
        # see tests/wycheproof/java/com/google/security/wycheproof/JsonUtil.java
        # specifically the asBigInteger() function for more information
        padding = "0" if "0" <= hex_digits[0] <= "7" else "f"
        hex_digits = padding + hex_digits
        counter_wycheproof_hex_reconstruction += 1

    try:
        # test error-free conversion to binary
        binascii.unhexlify(hex_digits)
    # TODO binascii.Incomplete exception also relevant?
    except binascii.Error:
        # default to the original input
        return data

    # one "\x.." escape per pair of hex digits
    escaped_bytes = "".join(
        "\\x" + hex_digits[offset : offset + 2]
        for offset in range(0, len(hex_digits) - 1, 2)
    )
    counter_hex_content += 1
    return '"%s"' % escaped_bytes
def search_files_recursively(directory, filetype_glob):
    """
    Collect recursive glob search results.

    directory: directory below which the search takes place
    filetype_glob: iterable of filename glob patterns

    Returns a list with the matches of all patterns, in pattern order.
    """
    print_verbose("searching in %s" % directory)
    matches = []
    for pattern in filetype_glob:
        print_verbose("searching for %s" % pattern)
        matches += glob.glob(f"{directory}/**/{pattern}", recursive=True)
    return matches
def print_verbose(text):
    """print wrapper that stays silent unless the verbose flag is set"""
    if not verbose:
        return
    print(text)
def _extract_json_strings(filepath, candidate_lines):
    """Add the single-line strings of allowlisted JSON lines; return the number of hits."""
    hits = 0
    with open(filepath) as _file:
        print_verbose("processing %s" % filepath)
        for line in _file:
            if ignore_single_line_json(line):
                continue
            for result in regex_oneline_string.findall(line):
                candidate_lines.add(result)
                hits += 1
    return hits


def _extract_single_line_content(filepath, candidate_lines):
    """Add single-line strings and 0x constants of a code file; return the number of hits."""
    hits = 0
    # True while the previous line looked like a part of a multi-line string;
    # regex_oneline_string is prone to matching the last line of such a
    # string, so the first match on the following line is not collected
    last_line_was_multiline_string = False
    with open(filepath) as _file:
        print_verbose("processing %s for single-line strings" % filepath)
        for line in _file:
            if ignore_single_line_java(line) or ignore_single_line_c(line):
                last_line_was_multiline_string = False
                continue

            if regex_oneline_string_java_ignore1.search(line):
                # "+" continuation line of a Java multi-line string, which
                # apparently ends on this line if a one-line string matches
                last_line_was_multiline_string = (
                    regex_oneline_string.search(line) is None
                )
                continue

            if regex_string_general.search(line):
                # at least one general string is matched, see if it is
                # a single-line string
                results = regex_oneline_string.findall(line)
                for result in results:
                    if not last_line_was_multiline_string:
                        candidate_lines.add(result)
                        hits += 1
                    # only the first match can be the end of a multi-line string
                    last_line_was_multiline_string = False
                if not results:
                    # a string without the expected terminator: treated as
                    # the start or middle of a multi-line string
                    last_line_was_multiline_string = True
            else:
                last_line_was_multiline_string = False

            # TODO split this into a separate loop?
            for result in regex_hex_constant_singleline.findall(line):
                # remove the "0x" prefix, add quotes
                candidate_lines.add('"%s"' % result[2:])
                hits += 1
    return hits


def _extract_multiline_strings(
    filepath, candidate_lines, regex_multiline, regex_intermediary, style_name
):
    """Add the joined multi-line strings of a file that are found via regex_multiline."""
    with open(filepath) as _file:
        print_verbose(
            "processing %s for %s multi-line strings" % (filepath, style_name)
        )
        filecontent = _file.read()
    multiline_results = regex_multiline.findall(filecontent)
    if multiline_results:
        print_verbose("results: %d" % len(multiline_results))
    for result in multiline_results:
        # drop the quote / line break / quote seams between the segments
        candidate_lines.add(regex_intermediary.sub("", result))


def recursive_dictionary_extraction(directory):
    """
    Handle the central extraction logic.

    Searches the directory recursively and runs four passes over the results:
    1. single-line strings from allowlisted lines of the targeted JSON files
    2. single-line strings and 0x constants from the targeted code files
    3. C-style multi-line strings from the class A file types
    4. Java-style multi-line strings from the class B file types

    directory: directory below which the files are searched

    Returns a set of candidate entries; the set structure de-duplicates the
    results automatically. Exceptions raised while opening or reading a file
    are not handled here.
    """
    candidate_lines = set()

    for filepath in search_files_recursively(
        directory, targeted_filetypes_singleline
    ):
        hits = _extract_json_strings(filepath, candidate_lines)
        if hits > 0:
            print_verbose("results: %d" % hits)
    print_verbose("number of candidate entries: %d" % len(candidate_lines))

    for filepath in search_files_recursively(directory, targeted_filetypes_multiline):
        hits = _extract_single_line_content(filepath, candidate_lines)
        if hits > 0:
            print_verbose("results: %d" % hits)

    for filepath in search_files_recursively(
        directory, targeted_filetypes_multiline_classA
    ):
        _extract_multiline_strings(
            filepath,
            candidate_lines,
            regex_c_style_multiline,
            regex_c_intermediary_content,
            "C-style",
        )

    for filepath in search_files_recursively(
        directory, targeted_filetypes_multiline_classB
    ):
        _extract_multiline_strings(
            filepath,
            candidate_lines,
            regex_java_style_multiline,
            regex_java_intermediary_content,
            "Java-style",
        )

    return candidate_lines
if __name__ == "__main__":
    # command line: mandatory output path, optional verbosity switch
    cli = argparse.ArgumentParser()
    cli.add_argument("dictionary_output_file", help="output file", type=str)
    cli.add_argument("--verbose", action="store_true", help="verbose stdout output")
    args = cli.parse_args()
    verbose = args.verbose

    # sorting the candidates makes the order of the output file deterministic
    sorted_candidate_lines = sorted(recursive_dictionary_extraction(TARGET_DIR))

    # drop entries matched by the general ignore heuristic, then convert hex
    # content and apply the dictionary-specific encoding to the rest
    result_lines = [
        encode_strings_for_dictionary(detect_and_convert_hex(entry))
        for entry in sorted_candidate_lines
        if not ignore_general(entry)
    ]

    print_verbose("counter_hex_content: %d" % counter_hex_content)
    print_verbose(
        "counter_wycheproof_hex_reconstruction: %d"
        % counter_wycheproof_hex_reconstruction
    )
    print_verbose("overall deduplicated entries: %d" % len(sorted_candidate_lines))

    # one dictionary entry per line
    with open(args.dictionary_output_file, "w") as _file:
        _file.writelines("%s\n" % entry for entry in result_lines)