trezor-firmware/crypto/fuzzer/extract_fuzzer_dictionary.sh

#!/usr/bin/env bash

# usage: script.sh target-dictionary-filename

# This script searches for interesting strings in the source code and converts
# them into a standard fuzzer dictionary file.
#
# Note that this script is being phased out in favor of the more sophisticated
# extract_fuzzer_dictionary.py program.

# TODO known issues: the end result has some duplicates in it

# Dictionary file to write: the first command line argument, or the default
# name when no argument is given.
OUTPUT_FILE="${1:-fuzzer_crypto_tests_strings_dictionary1.txt}"
# Directory tree that is scanned for strings (relative path).
TARGET_DIR="../tests"

multiline_string_search() {
  # TODO the `find` regex behavior is Linux-specific
  find $TARGET_DIR -type f -regextype posix-extended -regex '.*\.(c|h|py|json|java|js)' | xargs cat | perl -p0e 's/"\s*\n\s*\"//smg'
}

# ensure empty file (`:` is the shell no-op; only the truncating redirect matters)
# the file name is quoted because it can come from the command line and may
# contain spaces
: > "$OUTPUT_FILE"

# strip multiline strings and extract them
# keep quoted strings that consist only of word characters and spaces
# exclude some hex strings (all lower-case or all upper-case byte pairs),
# but allow hex strings with mixed capitalization (Ethereum, rskip60)
# the sorted, de-duplicated lines are appended in one go instead of being
# re-echoed one by one
multiline_string_search \
  | grep -P -o '"[\w ]+"' \
  | grep -v -P '"(([0-9a-f][0-9a-f])+|([0-9A-F][0-9A-F])+)"' \
  | sort \
  | uniq >> "$OUTPUT_FILE"

# extract individual BIP39 and SLIP39 words
# TODO are those actually valuable as fuzzer dictionary input?
# grep -r -P -o -h "\"\w+\""  ../slip39_wordlist.h ../bip39_english.h | sort | uniq >> fuzzer_crypto_tests_strings_dictionary1.txt

# extract and convert binary input data from the unit tests
# find each file, cat it, concatenate multiline strings, look for hex strings in quotes
# note that this returns multiple megabytes of result strings due to the large amount
# of test cases in the wycheproof project subfolder
#
# the first grep selects quoted hex strings with an even number of digits, the
# second one drops the surrounding quotes
# a single sed then turns every ascii hex pair AA into \xAA for the fuzzer
# format (case is kept as found) and wraps the line in quotes; this avoids
# starting one subshell and one sed process per input line
multiline_string_search \
  | grep -P -o '"([0-9a-fA-F][0-9a-fA-F])+"' \
  | grep -P -o '([0-9a-fA-F][0-9a-fA-F])+' \
  | sort \
  | uniq \
  | sed -e 's/../\\x&/g' -e 's/^/"/' -e 's/$/"/' >> "$OUTPUT_FILE"

# search and reassemble BIP39 test seeds that span multiple lines
# find each file, cat it, concatenate multiline strings, look for BIP39 seed combinations with reasonable length
# (12 to 24 space-separated words of 3 to 10 word characters each)
# every unique match is wrapped in double quotes and appended to the output
# file, whose name is quoted so that it may contain spaces
multiline_string_search \
  | grep -Po '(\w{3,10} ){11,23}(\w{3,10})' \
  | sort \
  | uniq \
  | sed -e 's/^/"/' -e 's/$/"/' >> "$OUTPUT_FILE"
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00			`#!/usr/bin/env bash`

			`# usage: script.sh target-dictionary-filename`

feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`# This script searches for interesting strings in the source code and converts`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00			`# them into a standard fuzzer dictionary file.`
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`#`
			`# Note that this script is phased out in favor of the more sophisticated`
			`# extract_fuzzer_dictionary.py program`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`# TODO known issues: the end result has some duplicates in it`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`TARGET_DIR=../tests`
			`OUTPUT_FILE=${1:-fuzzer_crypto_tests_strings_dictionary1.txt}`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`multiline_string_search() {`
			# TODO the `find` regex behavior is Linux-specific
			`find $TARGET_DIR -type f -regextype posix-extended -regex '.\.(c\|h\|py\|json\|java\|js)' \| xargs cat \| perl -p0e 's/"\s\n\s*\"//smg'`
			`}`

			`# ensure empty file`
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`echo -n "" > $OUTPUT_FILE`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`# strip multiline strings and extract them`
			`# exclude some hex strings, but allow hex strings with mixed capitalization (Ethereum, rskip60)`
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`multiline_string_search \| grep -P -o "\"[\w ]+\"" \| grep -v -P "\"(([0-9a-f][0-9a-f])+\|([0-9A-F][0-9A-F])+)\"" \| sort \| uniq \| while read -r line ; do`
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`echo "$line" >> $OUTPUT_FILE`
			`done`

			`# extract individual BIP39 and SLIP39 words`
			`# TODO are those actually valuable as fuzzer dictionary input?`
			`# grep -r -P -o -h "\"\w+\"" ../slip39_wordlist.h ../bip39_english.h \| sort \| uniq >> fuzzer_crypto_tests_strings_dictionary1.txt`

			`# extract and convert binary input data from the unit tests`
			`# find each file, cat it, concatenate multiline strings, look for hex strings in quotes`
feat(crypto): improve fuzz testing code, documentation 2020-11-30 13:54:34 +00:00			`# note that this returns multiple megabyte of result strings due to the large amount`
			`# of test cases in the wycheproof project subfolder`
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`multiline_string_search \| grep -P -o "\"([0-9a-fA-F][0-9a-fA-F])+\"" \| grep -P -o "([0-9a-fA-F][0-9a-fA-F])+" \| sort \| uniq \| while read -r line ; do`
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`# turn ascii hex strings AA into \xaa for the fuzzer format and add quotes`
			`# extra backslash escape due to the bash nesting`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00			escaped_hex=`echo $line \| sed -e 's/../\\\\x&/g'`
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`echo "\"$escaped_hex\"" >> $OUTPUT_FILE`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00			`done`

crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`# search and reassemble BIP39 test seeds that span multiple lines`
			`# find each file, cat it, concatenate multiline strings, look for BIP39 seed combinations with reasonable length`
feat(crypto): improve trezor-crypto fuzzer, add new dictionary extraction program Introduce fuzzing harnesses for zkp* functions and adapt some differential fuzzing Additional documentation and minor cleanup Add temporary workaround for clang-14 and more explicit Makefile behavior 2022-01-01 13:04:56 +00:00			`multiline_string_search \| grep -Po "(\w{3,10} ){11,23}(\w{3,10})" \| sort \| uniq \| while read -r line ; do`
crypto: new iteration of fuzz testing code, improved dictionary script, minor documentation changes 2021-07-25 21:23:31 +00:00			`echo "\"$line\"" >> $OUTPUT_FILE`
crypto: improve fuzz testing code, harnesses, documentation and scripts 2020-11-30 13:54:34 +00:00			`done`