diff options
author | Ammar Askar <ammar@ammaraskar.com> | 2019-06-30 05:54:43 (GMT) |
---|---|---|
committer | Gregory P. Smith <greg@krypto.org> | 2019-06-30 05:54:42 (GMT) |
commit | 5cbbbd73a6acb6f96f5d6646aa7498d3dfb1706d (patch) | |
tree | d8d1f58554630a3a4b5609ce50b7b17ee8d76b7f /Modules/_xxtestfuzz | |
parent | eb97b9211e7c99841d6cae8c63893b3525d5a401 (diff) | |
download | cpython-5cbbbd73a6acb6f96f5d6646aa7498d3dfb1706d.zip cpython-5cbbbd73a6acb6f96f5d6646aa7498d3dfb1706d.tar.gz cpython-5cbbbd73a6acb6f96f5d6646aa7498d3dfb1706d.tar.bz2 |
bpo-29505: Add more fuzzing for re.compile, re.match and csv.reader (GH-14255)
Add more fuzz testing for re.compile, re.match and csv.reader
Diffstat (limited to 'Modules/_xxtestfuzz')
-rw-r--r-- | Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict | 219 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv | bin | 0 -> 118 bytes | |||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links | 1 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters | 1 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn | 1 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number | 1 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzz_tests.txt | 3 | ||||
-rw-r--r-- | Modules/_xxtestfuzz/fuzzer.c | 281 |
8 files changed, 491 insertions, 16 deletions
diff --git a/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict b/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict new file mode 100644 index 0000000..961306a --- /dev/null +++ b/Modules/_xxtestfuzz/dictionaries/fuzz_sre_compile.dict @@ -0,0 +1,219 @@ +"?" +"abc" +"()" +"[]" +"abc|def" +"abc|def|ghi" +"^xxx$" +"ab\\b\\d\\bcd" +"\\w|\\d" +"a*?" +"abc+" +"abc+?" +"xyz?" +"xyz??" +"xyz{0,1}" +"xyz{0,1}?" +"xyz{93}" +"xyz{1,32}" +"xyz{1,32}?" +"xyz{1,}" +"xyz{1,}?" +"a\\fb\\nc\\rd\\te\\vf" +"a\\nb\\bc" +"(?:foo)" +"(?: foo )" +"foo|(bar|baz)|quux" +"foo(?=bar)baz" +"foo(?!bar)baz" +"foo(?<=bar)baz" +"foo(?<!bar)baz" +"()" +"(?=)" +"[]" +"[x]" +"[xyz]" +"[a-zA-Z0-9]" +"[-123]" +"[^123]" +"]" +"}" +"[a-b-c]" +"[x\\dz]" +"[\\d-z]" +"[\\d-\\d]" +"[z-\\d]" +"\\cj\\cJ\\ci\\cI\\ck\\cK" +"\\c!" +"\\c_" +"\\c~" +"[\\c!]" +"[\\c_]" +"[\\c~]" +"[\\ca]" +"[\\cz]" +"[\\cA]" +"[\\cZ]" +"[\\c1]" +"\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ " +"[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]" +"\\8" +"\\9" +"\\11" +"\\11a" +"\\011" +"\\118" +"\\111" +"\\1111" +"(x)(x)(x)\\1" +"(x)(x)(x)\\2" +"(x)(x)(x)\\3" +"(x)(x)(x)\\4" +"(x)(x)(x)\\1*" +"(x)(x)(x)\\3*" +"(x)(x)(x)\\4*" +"(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10" +"(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11" +"(a)\\1" +"(a\\1)" +"(\\1a)" +"(\\2)(\\1)" +"(?=a){0,10}a" +"(?=a){1,10}a" +"(?=a){9,10}a" +"(?!a)?a" +"\\1(a)" +"(?!(a))\\1" +"(?!\\1(a\\1)\\1)\\1" +"\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1" +"[\\0]" +"[\\11]" +"[\\11a]" +"[\\011]" +"[\\00011]" +"[\\118]" +"[\\111]" +"[\\1111]" +"\\x60" +"\\x3z" +"\\c" +"\\u0034" +"\\u003z" +"foo[z]*" +"\\u{12345}" +"\\u{12345}\\u{23456}" +"\\u{12345}{3}" +"\\u{12345}*" +"\\ud808\\udf45*" +"[\\ud808\\udf45-\\ud809\\udccc]" +"a" +"a|b" +"a\\n" +"a$" +"a\\b!" +"a\\Bb" +"a*?" +"a?" +"a??" +"a{0,1}?" +"a{1,2}?" +"a+?" +"(a)" +"(a)\\1" +"(\\1a)" +"\\1(a)" +"a\\s" +"a\\S" +"a\\D" +"a\\w" +"a\\W" +"a." 
+"a\\q" +"a[a]" +"a[^a]" +"a[a-z]" +"a(?:b)" +"a(?=b)" +"a(?!b)" +"\\x60" +"\\u0060" +"\\cA" +"\\q" +"\\1112" +"(a)\\1" +"(?!a)?a\\1" +"(?:(?=a))a\\1" +"a{}" +"a{,}" +"a{" +"a{z}" +"a{12z}" +"a{12," +"a{12,3b" +"{}" +"{,}" +"{" +"{z}" +"{1z}" +"{12," +"{12,3b" +"a" +"abc" +"a[bc]d" +"a|bc" +"ab|c" +"a||bc" +"(?:ab)" +"(?:ab|cde)" +"(?:ab)|cde" +"(ab)" +"(ab|cde)" +"(ab)\\1" +"(ab|cde)\\1" +"(?:ab)?" +"(?:ab)+" +"a?" +"a+" +"a??" +"a*?" +"a+?" +"(?:a?)?" +"(?:a+)?" +"(?:a?)+" +"(?:a*)+" +"(?:a+)+" +"(?:a?)*" +"(?:a*)*" +"(?:a+)*" +"a{0}" +"(?:a+){0,0}" +"a*b" +"a+b" +"a*b|c" +"a+b|c" +"(?:a{5,1000000}){3,1000000}" +"(?:ab){4,7}" +"a\\bc" +"a\\sc" +"a\\Sc" +"a(?=b)c" +"a(?=bbb|bb)c" +"a(?!bbb|bb)c" +"\xe2\x81\xa3" +"[\xe2\x81\xa3]" +"\xed\xb0\x80" +"\xed\xa0\x80" +"(\xed\xb0\x80)\x01" +"((\xed\xa0\x80))\x02" +"\xf0\x9f\x92\xa9" +"\x01" +"\x0f" +"[-\xf0\x9f\x92\xa9]+" +"[\xf0\x9f\x92\xa9-\xf4\x8f\xbf\xbf]" +"(?<=)" +"(?<=a)" +"(?<!)" +"(?<!a)" +"(?<a>)" +"(?<a>.)" +"(?<a>.)\\k<a>" diff --git a/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv b/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv Binary files differnew file mode 100644 index 0000000..8b7887d --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_csv_reader_corpus/test.csv diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links new file mode 100644 index 0000000..d99247c --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/anchor_links @@ -0,0 +1 @@ +XX<a\s*href=(.*?)[\s|>] diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters new file mode 100644 index 0000000..0c67ee7 --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/characters @@ -0,0 +1 @@ +XX^(Tim|Robert)\s+the\s+(Enchanter|Shrubber)$ diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn new file mode 100644 index 0000000..cce8919 
--- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/isbn @@ -0,0 +1 @@ +XX/((978[\--– ])?[0-9][0-9\--– ]{10}[\--– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])/ diff --git a/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number new file mode 100644 index 0000000..1e2efc5 --- /dev/null +++ b/Modules/_xxtestfuzz/fuzz_sre_compile_corpus/phone_number @@ -0,0 +1 @@ +XX(\+1|1)?[ \-\.]?\(?(?<areacode>[0-9]{3})\)?[ \-\.]?(?<prefix>[0-9]{3})[ \-\.]?(?<number>[0-9]{4})[ \.]*(ext|x)?[ \.]*(?<extension>[0-9]{0,5}) diff --git a/Modules/_xxtestfuzz/fuzz_tests.txt b/Modules/_xxtestfuzz/fuzz_tests.txt index f012129..9d330a6 100644 --- a/Modules/_xxtestfuzz/fuzz_tests.txt +++ b/Modules/_xxtestfuzz/fuzz_tests.txt @@ -2,3 +2,6 @@ fuzz_builtin_float fuzz_builtin_int fuzz_builtin_unicode fuzz_json_loads +fuzz_sre_compile +fuzz_sre_match +fuzz_csv_reader diff --git a/Modules/_xxtestfuzz/fuzzer.c b/Modules/_xxtestfuzz/fuzzer.c index e862a99..16104e4 100644 --- a/Modules/_xxtestfuzz/fuzzer.c +++ b/Modules/_xxtestfuzz/fuzzer.c @@ -81,8 +81,17 @@ static int fuzz_builtin_unicode(const char* data, size_t size) { #define MAX_JSON_TEST_SIZE 0x10000 -/* Initialized in LLVMFuzzerTestOneInput */ PyObject* json_loads_method = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_json_loads() { + /* Import json.loads */ + PyObject* json_module = PyImport_ImportModule("json"); + if (json_module == NULL) { + return 0; + } + json_loads_method = PyObject_GetAttrString(json_module, "loads"); + return json_loads_method != NULL; +} /* Fuzz json.loads(x) */ static int fuzz_json_loads(const char* data, size_t size) { /* Since python supports arbitrarily large ints in JSON, @@ -96,22 +105,227 @@ static int fuzz_json_loads(const char* data, size_t size) { return 0; } PyObject* parsed = PyObject_CallFunctionObjArgs(json_loads_method, input_bytes, NULL); + if (parsed == NULL) { + /* Ignore ValueError as the fuzzer will more 
than likely + generate some invalid json and values */ + if (PyErr_ExceptionMatches(PyExc_ValueError) || + /* Ignore RecursionError as the fuzzer generates long sequences of + arrays such as `[[[...` */ + PyErr_ExceptionMatches(PyExc_RecursionError) || + /* Ignore unicode errors, invalid byte sequences are common */ + PyErr_ExceptionMatches(PyExc_UnicodeDecodeError) + ) { + PyErr_Clear(); + } + } + Py_DECREF(input_bytes); + Py_XDECREF(parsed); + return 0; +} + +#define MAX_RE_TEST_SIZE 0x10000 + +PyObject* sre_compile_method = NULL; +PyObject* sre_error_exception = NULL; +int SRE_FLAG_DEBUG = 0; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_sre_compile() { + /* Import sre_compile.compile and sre.error */ + PyObject* sre_compile_module = PyImport_ImportModule("sre_compile"); + if (sre_compile_module == NULL) { + return 0; + } + sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile"); + if (sre_compile_method == NULL) { + return 0; + } + + PyObject* sre_constants = PyImport_ImportModule("sre_constants"); + if (sre_constants == NULL) { + return 0; + } + sre_error_exception = PyObject_GetAttrString(sre_constants, "error"); + if (sre_error_exception == NULL) { + return 0; + } + PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG"); + if (debug_flag == NULL) { + return 0; + } + SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag); + return 1; +} +/* Fuzz _sre.compile(x) */ +static int fuzz_sre_compile(const char* data, size_t size) { + /* Ignore really long regex patterns that will timeout the fuzzer */ + if (size > MAX_RE_TEST_SIZE) { + return 0; + } + /* We treat the first 2 bytes of the input as a number for the flags */ + if (size < 2) { + return 0; + } + uint16_t flags = ((uint16_t*) data)[0]; + /* We remove the SRE_FLAG_DEBUG if present. 
This is because it + prints to stdout which greatly decreases fuzzing speed */ + flags &= ~SRE_FLAG_DEBUG; + + /* Pull the pattern from the remaining bytes */ + PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2); + if (pattern_bytes == NULL) { + return 0; + } + PyObject* flags_obj = PyLong_FromUnsignedLong(flags); + if (flags_obj == NULL) { + Py_DECREF(pattern_bytes); + return 0; + } + + /* compiled = _sre.compile(data[2:], data[0:2] */ + PyObject* compiled = PyObject_CallFunctionObjArgs( + sre_compile_method, pattern_bytes, flags_obj, NULL); /* Ignore ValueError as the fuzzer will more than likely - generate some invalid json and values */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { + generate some invalid combination of flags */ + if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { PyErr_Clear(); } - /* Ignore RecursionError as the fuzzer generates long sequences of - arrays such as `[[[...` */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_RecursionError)) { + /* Ignore some common errors thrown by sre_parse: + Overflow, Assertion and Index */ + if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) || + PyErr_ExceptionMatches(PyExc_AssertionError) || + PyErr_ExceptionMatches(PyExc_IndexError)) + ) { PyErr_Clear(); } - /* Ignore unicode errors, invalid byte sequences are common */ - if (parsed == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + /* Ignore re.error */ + if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) { PyErr_Clear(); } - Py_DECREF(input_bytes); - Py_XDECREF(parsed); + + Py_DECREF(pattern_bytes); + Py_DECREF(flags_obj); + Py_XDECREF(compiled); + return 0; +} + +/* Some random patterns used to test re.match. 
+ Be careful not to add catostraphically slow regexes here, we want to + excercise the matching code without causing timeouts.*/ +static const char* regex_patterns[] = { + ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]", + "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?", + "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$", + "(?:a*)*", "a{1,2}?" +}; +const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]); +PyObject** compiled_patterns = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_sre_match() { + PyObject* re_module = PyImport_ImportModule("re"); + if (re_module == NULL) { + return 0; + } + compiled_patterns = (PyObject**) PyMem_RawMalloc( + sizeof(PyObject*) * NUM_PATTERNS); + if (compiled_patterns == NULL) { + PyErr_NoMemory(); + return 0; + } + + /* Precompile all the regex patterns on the first run for faster fuzzing */ + for (size_t i = 0; i < NUM_PATTERNS; i++) { + PyObject* compiled = PyObject_CallMethod( + re_module, "compile", "y", regex_patterns[i]); + /* Bail if any of the patterns fail to compile */ + if (compiled == NULL) { + return 0; + } + compiled_patterns[i] = compiled; + } + return 1; +} +/* Fuzz re.match(x) */ +static int fuzz_sre_match(const char* data, size_t size) { + if (size < 1 || size > MAX_RE_TEST_SIZE) { + return 0; + } + /* Use the first byte as a uint8_t specifying the index of the + regex to use */ + unsigned char idx = (unsigned char) data[0]; + idx = idx % NUM_PATTERNS; + + /* Pull the string to match from the remaining bytes */ + PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1); + if (to_match == NULL) { + return 0; + } + + PyObject* pattern = compiled_patterns[idx]; + PyObject* match_callable = PyObject_GetAttrString(pattern, "match"); + + PyObject* matches = PyObject_CallFunctionObjArgs(match_callable, to_match, NULL); + + Py_XDECREF(matches); + Py_DECREF(match_callable); + Py_DECREF(to_match); + 
return 0; +} + +#define MAX_CSV_TEST_SIZE 0x10000 +PyObject* csv_module = NULL; +PyObject* csv_error = NULL; +/* Called by LLVMFuzzerTestOneInput for initialization */ +static int init_csv_reader() { + /* Import csv and csv.Error */ + csv_module = PyImport_ImportModule("csv"); + if (csv_module == NULL) { + return 0; + } + csv_error = PyObject_GetAttrString(csv_module, "Error"); + return csv_error != NULL; +} +/* Fuzz csv.reader([x]) */ +static int fuzz_csv_reader(const char* data, size_t size) { + if (size < 1 || size > MAX_CSV_TEST_SIZE) { + return 0; + } + /* Ignore non null-terminated strings since _csv can't handle + embeded nulls */ + if (memchr(data, '\0', size) == NULL) { + return 0; + } + + PyObject* s = PyUnicode_FromString(data); + /* Ignore exceptions until we have a valid string */ + if (s == NULL) { + PyErr_Clear(); + return 0; + } + + /* Split on \n so we can test multiple lines */ + PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n"); + if (lines == NULL) { + Py_DECREF(s); + return 0; + } + + PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines); + if (reader) { + /* Consume all of the reader as an iterator */ + PyObject* parsed_line; + while ((parsed_line = PyIter_Next(reader))) { + Py_DECREF(parsed_line); + } + } + + /* Ignore csv.Error because we're probably going to generate + some bad files (embeded new-lines, unterminated quotes etc) */ + if (PyErr_ExceptionMatches(csv_error)) { + PyErr_Clear(); + } + + Py_XDECREF(reader); + Py_DECREF(s); return 0; } @@ -152,12 +366,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { initialize CPython ourselves on the first run. 
*/ Py_InitializeEx(0); } -#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) - if (json_loads_method == NULL) { - PyObject* json_module = PyImport_ImportModule("json"); - json_loads_method = PyObject_GetAttrString(json_module, "loads"); - } -#endif int rv = 0; @@ -171,7 +379,48 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { rv |= _run_fuzz(data, size, fuzz_builtin_unicode); #endif #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) + static int JSON_LOADS_INITIALIZED = 0; + if (!JSON_LOADS_INITIALIZED && !init_json_loads()) { + PyErr_Print(); + abort(); + } else { + JSON_LOADS_INITIALIZED = 1; + } + rv |= _run_fuzz(data, size, fuzz_json_loads); #endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile) + static int SRE_COMPILE_INITIALIZED = 0; + if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) { + PyErr_Print(); + abort(); + } else { + SRE_COMPILE_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_sre_compile); +#endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match) + static int SRE_MATCH_INITIALIZED = 0; + if (!SRE_MATCH_INITIALIZED && !init_sre_match()) { + PyErr_Print(); + abort(); + } else { + SRE_MATCH_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_sre_match); +#endif +#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader) + static int CSV_READER_INITIALIZED = 0; + if (!CSV_READER_INITIALIZED && !init_csv_reader()) { + PyErr_Print(); + abort(); + } else { + CSV_READER_INITIALIZED = 1; + } + + rv |= _run_fuzz(data, size, fuzz_csv_reader); +#endif return rv; } |