trezor-firmware/crypto/fuzzer/extract_fuzzer_dictionary.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This experimental program is designed to extract a subset of interesting test
case snippets from the trezor-crypto test directory and output them as a
standard fuzzer dictionary file.

The program is built on quick-and-dirty regex matching that is known to be
incorrect for parsing code files, but is considered "good enough" for this
specific purpose.
Note that there are target-specific configurations and internal filter settings.
"""

import argparse
import binascii
import glob
import re

# re2 is considered for future use
# it requires a system installation and the google-re2 python package
# import re2


# Expected target format for strings in code:
# Most strings are defined in the general form "example"
# There are a few test vectors in crypto/tests/wycheproof/javascript/EcUtil.js
# with 'example' style string definitions, these are ignored for now

TARGET_DIR = "../tests"

# intentionally excluded file types that currently do not provide enough value:
# *.js, *.md, *.sh, *.html and others from the wycheproof subdirectory

targeted_filetypes_multiline_classA = ("*.c", "*.h", "*.py")
# Java files have different multiline strings that are handled differently
targeted_filetypes_multiline_classB = ("*.java",)
targeted_filetypes_multiline = (
    targeted_filetypes_multiline_classA + targeted_filetypes_multiline_classB
)

# files without multiline string content
# Note: consider switching to actual JSON parsing?
# Note: the wycheproof repository has a number of test cases for other
# cryptography such as DSA and RSA which are currently less interesting for the
# fuzzer dictionary and therefore excluded
targeted_filetypes_singleline = (
    "aes*.json",
    "ecdh*.json",
    "ecdsa*.json",
    "x25519*.json",
    "chacha20*.json",
    "kw*.json",
)

verbose = False

# patterns to extract
# singleline:
# "4a1e76f133afb"
# 0xAF8BBDFE8CDD5 and 0x0488b21e
# m/0'/2147483647'/1'/2147483646'/2' in test_check.c via m/[\d'/]+
#
# multiline:
# "fffc" \n "99"
# "dpubZ9169K" \n "bTYbcY"
# "\x65\xf9" \\n  "\xa0\x6a"
# { 0x086d8bd5, 0x1018f82f, \n 0xc55ece} , see rg "0x([a-zA-Z0-9])+"

# patterns to ignore
# lines with print statements
# lines with exceptions
# comments and other metadata in the testvector JSON files
# filenames
# import statements and other package names

# patterns to investigate further
# public keys with the form BEGIN PUBLIC KEY
# TODO "abc" + "def" string concatenation on the same line without newline
# strings in comments

# dictionary text export file format
# general description:
# https://github.com/AFLplusplus/AFLplusplus/blob/stable/dictionaries/README.md
#
# the exported file is primarly designed for use with a recent libFuzzer version
# and is known to be partially incompatible with other fuzzers that impose
# other limitations
#
# known incompatibilities:
# * honggfuzz only reads a limited number of dictionary entries (8192 with version 2.5)
# * afl++ only reads line content with up to 128 byte

# match everything in quotes that doesn't have an internal quote character and
# at least one internal character
regex_string_general_definition = r"\"[^\"]+\""
regex_string_general = re.compile(regex_string_general_definition)
# the capturing group ignores prefix and suffix outside of the quotes
# Note that this is prone to matching the last line of a C-style multiline string,
# which is addressed via extra state handling during the file processing
regex_oneline_string = re.compile(
    r"(" + regex_string_general_definition + r")\s*[\,\)]+"
)
# ignore lines that have a "+" character preceding a string
regex_oneline_string_java_ignore1 = re.compile(r"^\s*\+\s*\"")

regex_hex_character_segment_inner_definition = "[0-9a-fA-F]+"
regex_hex_character_input_complete = re.compile(
    '^"' + regex_hex_character_segment_inner_definition + '"$'
)
regex_hex_character_input_inner = re.compile(
    regex_hex_character_segment_inner_definition
)
# most constants are preceded by a space, but some have a "(" "[" or "{" before them
regex_hex_constant_singleline = re.compile(r"(?<=\(|\[|\{| )0x[a-fA-F0-9]+")

regex_c_style_multiline = re.compile(r"(?:\".+\"\s*\n\s*)+(?:\".+\")", re.MULTILINE)
regex_c_intermediary_content = re.compile(r"\"\s*\n\s*\"", re.MULTILINE)
# TODO how to prevent matching in the middle of a multi-line string concatenation?
# negative lookbehind for "+" is not possible generically and
# (?<!\+ ) and similar patterns are too static

regex_java_style_multiline = re.compile(
    r"(?:\".+\"\s*\n\s*\+\s*)+(?:\".+\")", re.MULTILINE
)
regex_java_intermediary_content = re.compile(r"\"\s*\n\s*\+\s*\"", re.MULTILINE)

regex_text_newline = re.compile(r"\\n")

# primitive regex that catches most filenames in the data set
regex_filename_heuristic = re.compile(r"\.[a-zA-Z]+")

counter_hex_content = 0
counter_wycheproof_hex_reconstruction = 0

# TODO add '"curve"' to capture algorithm names?
allowlist_keywords_json = (
    '"uncompressed"',
    '"wx"',
    '"wy"',
    '"msg"',
    '"sig"',
    '"key"',
    '"iv"',
    '"ct"',
    '"aad"',
    '"tag"',
    '"public"',
    '"private"',
    '"shared"',
    '"padding"',
    '"x"',
    '"d"',
)

# TODO the "keyPem" entry is only a workaround for an encoding issue
ignore_keywords_java = (
    "println(",
    "Exception(",
    '"keyPem"',
)
ignore_keywords_c = ("printf(",)


def ignore_single_line_json(data):
    """return True if the input should be ignored"""
    # ignore everything that is not matched by the allowlist
    for keyword in allowlist_keywords_json:
        if data.find(keyword) > -1:
            return False
    return True


def ignore_single_line_java(data):
    """return True if the input should be ignored"""
    for keyword in ignore_keywords_java:
        if data.find(keyword) > -1:
            return True
    return False


def ignore_single_line_c(data):
    """return True if the input should be ignored"""
    for keyword in ignore_keywords_c:
        if data.find(keyword) > -1:
            return True
    return False


def ignore_general(data):
    """return True if the input should be ignored"""
    if regex_filename_heuristic.search(data):
        return True
    return False


def encode_strings_for_dictionary(data):
    """
    Assumes that inputs are already in string quotes

    Handles dictionary-specific encoding steps
    """
    # libfuzzer does not like "\n" string patterns in dictionary files, replace
    # it with an encoded newline
    data = regex_text_newline.sub("\\\\x0a", data)
    return data


def detect_and_convert_hex(data):
    """
    Convert hex strings

    Directly pass through non-hex content
    """
    global counter_hex_content
    global counter_wycheproof_hex_reconstruction
    match_result1 = regex_hex_character_input_complete.search(data)
    if match_result1:

        match_result2 = regex_hex_character_input_inner.search(match_result1.string)
        isolated_substring = match_result2.group(0)
        if len(isolated_substring) % 2 == 1:
            # Note: the test cases in the wycheproof testvector JSON files have
            # a custom binary hex format to represent keys
            # among other things, this results in hex strings with an uneven
            # number of characters
            # see tests/wycheproof/java/com/google/security/wycheproof/JsonUtil.java
            # specifically the asBigInteger() function for more information
            if isolated_substring[0] >= "0" and isolated_substring[0] <= "7":
                isolated_substring = "0" + isolated_substring
            else:
                isolated_substring = "f" + isolated_substring
            counter_wycheproof_hex_reconstruction += 1

        converted_result = ""
        try:
            # test error-free conversion to binary
            binascii.unhexlify(isolated_substring)
            hex_with_c_style_formatting = ""
            pos = 0
            while pos < len(isolated_substring) - 1:
                hex_with_c_style_formatting += "\\x" + isolated_substring[pos : pos + 2]
                pos += 2

            converted_result = '"%s"' % hex_with_c_style_formatting
        # TODO binascii.Incomplete exception also relevant?
        except binascii.Error:
            # default to the original input
            return data
        counter_hex_content += 1
        return converted_result
    return data


def search_files_recursively(directory, filetype_glob):
    """returns glob search results"""
    target_files = []
    print_verbose("searching in %s" % directory)
    for filetype in filetype_glob:
        print_verbose("searching for %s" % filetype)
        target_files.extend(glob.glob(f"{directory}/**/{filetype}", recursive=True))
    return target_files


def print_verbose(text):
    """print wrapper"""
    if verbose:
        print(text)


def recursive_dictionary_extraction(directory):
    """handle the central extraction logic"""
    # TODO split this function up into subfunctions
    global counter_hex_content
    # handle as a set structure to de-duplicate results automatically
    candidate_lines = set()

    target_files = search_files_recursively(directory, targeted_filetypes_singleline)
    for filepath in target_files:
        per_file_result_counter = 0
        with open(filepath) as _file:
            print_verbose("processing %s" % filepath)
            for _, line in enumerate(_file.readlines()):
                if ignore_single_line_json(line):
                    continue
                results = regex_oneline_string.findall(line)
                for result in results:
                    candidate_lines.add(result)
                    per_file_result_counter += 1
            if per_file_result_counter > 0:
                print_verbose("results: %d" % per_file_result_counter)

    print_verbose("number of candidate entries: %d" % len(candidate_lines))

    target_files = search_files_recursively(directory, targeted_filetypes_multiline)
    for filepath in target_files:
        per_file_result_counter = 0
        with open(filepath) as _file:
            last_line_was_multiline_string = False
            print_verbose("processing %s for single-line strings" % filepath)
            for _, line in enumerate(_file.readlines()):
                if ignore_single_line_java(line):
                    last_line_was_multiline_string = False
                    continue
                if ignore_single_line_c(line):
                    last_line_was_multiline_string = False
                    continue
                if regex_oneline_string_java_ignore1.search(line):
                    last_line_was_multiline_string = True
                    if regex_oneline_string.search(line):
                        # the Java multiline string apparently ends on this line
                        last_line_was_multiline_string = False
                    continue

                result_general_string = regex_string_general.search(line)
                if result_general_string:
                    # at least one general string is matched, see if it is
                    # a single-line string
                    results = regex_oneline_string.findall(line)
                    for result in results:
                        if not last_line_was_multiline_string:
                            candidate_lines.add(result)
                            per_file_result_counter += 1
                        last_line_was_multiline_string = False
                    if len(results) == 0:
                        last_line_was_multiline_string = True
                else:
                    last_line_was_multiline_string = False

                # TODO split this into a separate loop?
                results = regex_hex_constant_singleline.findall(line)
                for result in results:
                    # remove the "0x" prefix, add quotes
                    candidate_lines.add('"%s"' % result[2:])
                    per_file_result_counter += 1

            if per_file_result_counter > 0:
                print_verbose("results: %d" % per_file_result_counter)

    target_files = search_files_recursively(
        directory, targeted_filetypes_multiline_classA
    )

    for filepath in target_files:
        with open(filepath) as _file:
            print_verbose("processing %s for C-style multi-line strings" % filepath)
            filecontent = _file.read()
            multiline_results = regex_c_style_multiline.findall(filecontent)
            if len(multiline_results) > 0:
                print_verbose("results: %d" % len(multiline_results))
            for result in multiline_results:
                cleanup = regex_c_intermediary_content.sub("", result)
                candidate_lines.add(cleanup)

    target_files = search_files_recursively(
        directory, targeted_filetypes_multiline_classB
    )

    for filepath in target_files:
        with open(filepath) as _file:
            print_verbose("processing %s for Java-style multi-line strings" % filepath)
            filecontent = _file.read()
            multiline_results = regex_java_style_multiline.findall(filecontent)
            if len(multiline_results) > 0:
                print_verbose("results: %d" % len(multiline_results))
            for result in multiline_results:
                cleanup = regex_java_intermediary_content.sub("", result)
                candidate_lines.add(cleanup)

    return candidate_lines


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dictionary_output_file", help="output file", type=str)
    parser.add_argument("--verbose", action="store_true", help="verbose stdout output")

    args = parser.parse_args()
    verbose = args.verbose

    collected_candidate_lines = recursive_dictionary_extraction(TARGET_DIR)
    sorted_candidate_lines = sorted(collected_candidate_lines)
    result_lines = []
    for candidate_line in sorted_candidate_lines:
        if ignore_general(candidate_line):
            continue
        result_lines.append(
            encode_strings_for_dictionary(detect_and_convert_hex(candidate_line))
        )

    print_verbose("counter_hex_content: %d" % counter_hex_content)
    print_verbose(
        "counter_wycheproof_hex_reconstruction: %d"
        % counter_wycheproof_hex_reconstruction
    )
    print_verbose("overall deduplicated entries: %d" % len(sorted_candidate_lines))

    with open(args.dictionary_output_file, "w") as _file:
        for result_line in result_lines:
            _file.write("%s\n" % result_line)