Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 1 | // Copyright 2016 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | #include <stddef.h> |
| 6 | #include <stdint.h> |
Paul Wankadia | 1d0aa85 | 2016-05-31 21:54:34 +1000 | [diff] [blame] | 7 | #include <map> |
Paul Wankadia | 749d64c | 2018-11-05 21:13:23 -0800 | [diff] [blame] | 8 | #include <memory> |
| 9 | #include <queue> |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 10 | #include <string> |
| 11 | |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 12 | #include "re2/prefilter.h" |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 13 | #include "re2/re2.h" |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 14 | |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 15 | using re2::StringPiece; |
| 16 | using std::string; |
| 17 | |
| 18 | // NOT static, NOT signed. |
| 19 | uint8_t dummy = 0; |
| 20 | |
| 21 | void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { |
| 22 | RE2 re(pattern, options); |
| 23 | if (!re.ok()) |
| 24 | return; |
| 25 | |
Paul Wankadia | eb43800 | 2017-02-17 15:54:27 +1100 | [diff] [blame] | 26 | // Don't waste time fuzzing high-size programs. |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 27 | // They can cause bug reports due to fuzzer timeouts. |
Paul Wankadia | eb43800 | 2017-02-17 15:54:27 +1100 | [diff] [blame] | 28 | int size = re.ProgramSize(); |
Paul Wankadia | 54ca2cd | 2018-10-02 19:42:12 -0700 | [diff] [blame] | 29 | if (size > 9999) |
| 30 | return; |
Paul Wankadia | f94a5b7 | 2018-10-02 01:19:30 -0700 | [diff] [blame] | 31 | int rsize = re.ReverseProgramSize(); |
Paul Wankadia | 54ca2cd | 2018-10-02 19:42:12 -0700 | [diff] [blame] | 32 | if (rsize > 9999) |
Paul Wankadia | eb43800 | 2017-02-17 15:54:27 +1100 | [diff] [blame] | 33 | return; |
| 34 | |
Paul Wankadia | 1d0aa85 | 2016-05-31 21:54:34 +1000 | [diff] [blame] | 35 | // Don't waste time fuzzing high-fanout programs. |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 36 | // They can cause bug reports due to fuzzer timeouts. |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 37 | std::map<int, int> histogram; |
Paul Wankadia | 1d0aa85 | 2016-05-31 21:54:34 +1000 | [diff] [blame] | 38 | int fanout = re.ProgramFanout(&histogram); |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 39 | if (fanout > 9) |
Paul Wankadia | 54ca2cd | 2018-10-02 19:42:12 -0700 | [diff] [blame] | 40 | return; |
Paul Wankadia | f94a5b7 | 2018-10-02 01:19:30 -0700 | [diff] [blame] | 41 | int rfanout = re.ReverseProgramFanout(&histogram); |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 42 | if (rfanout > 9) |
| 43 | return; |
| 44 | |
| 45 | // Don't waste time fuzzing programs with large substrings. |
| 46 | // They can cause bug reports due to fuzzer timeouts when they |
| 47 | // are repetitions (e.g. hundreds of NUL bytes) and matching is |
| 48 | // unanchored. And they aren't interesting for fuzzing purposes. |
Paul Wankadia | 749d64c | 2018-11-05 21:13:23 -0800 | [diff] [blame] | 49 | std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re)); |
| 50 | if (prefilter == nullptr) |
Paul Wankadia | 22caec6 | 2018-10-29 22:58:45 -0700 | [diff] [blame] | 51 | return; |
Paul Wankadia | 749d64c | 2018-11-05 21:13:23 -0800 | [diff] [blame] | 52 | std::queue<re2::Prefilter*> nodes; |
| 53 | nodes.push(prefilter.get()); |
| 54 | while (!nodes.empty()) { |
| 55 | re2::Prefilter* node = nodes.front(); |
| 56 | nodes.pop(); |
| 57 | if (node->op() == re2::Prefilter::ATOM) { |
| 58 | if (node->atom().size() > 9) |
| 59 | return; |
| 60 | } else if (node->op() == re2::Prefilter::AND || |
| 61 | node->op() == re2::Prefilter::OR) { |
| 62 | for (re2::Prefilter* sub : *node->subs()) |
| 63 | nodes.push(sub); |
| 64 | } |
Paul Wankadia | 169debd | 2018-11-05 20:47:31 -0800 | [diff] [blame] | 65 | } |
Paul Wankadia | 1d0aa85 | 2016-05-31 21:54:34 +1000 | [diff] [blame] | 66 | |
Paul Wankadia | f620af7 | 2018-12-09 22:42:42 -0800 | [diff] [blame] | 67 | if (re.NumberOfCapturingGroups() == 0) { |
| 68 | // Avoid early return due to too many arguments. |
| 69 | StringPiece sp = text; |
| 70 | RE2::FullMatch(sp, re); |
| 71 | RE2::PartialMatch(sp, re); |
| 72 | RE2::Consume(&sp, re); |
| 73 | sp = text; // Reset. |
| 74 | RE2::FindAndConsume(&sp, re); |
| 75 | } else { |
| 76 | // Okay, we have at least one capturing group... |
| 77 | // Try conversion for variously typed arguments. |
| 78 | StringPiece sp = text; |
| 79 | short s; |
| 80 | RE2::FullMatch(sp, re, &s); |
| 81 | long l; |
| 82 | RE2::PartialMatch(sp, re, &l); |
| 83 | float f; |
| 84 | RE2::Consume(&sp, re, &f); |
| 85 | sp = text; // Reset. |
| 86 | double d; |
| 87 | RE2::FindAndConsume(&sp, re, &d); |
| 88 | } |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 89 | |
Paul Wankadia | f620af7 | 2018-12-09 22:42:42 -0800 | [diff] [blame] | 90 | string s = string(text); |
| 91 | RE2::Replace(&s, re, ""); |
| 92 | s = string(text); // Reset. |
| 93 | RE2::GlobalReplace(&s, re, ""); |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 94 | |
Paul Wankadia | f620af7 | 2018-12-09 22:42:42 -0800 | [diff] [blame] | 95 | string min, max; |
| 96 | re.PossibleMatchRange(&min, &max, /*maxlen=*/9); |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 97 | |
| 98 | // Exercise some other API functionality. |
Paul Wankadia | f620af7 | 2018-12-09 22:42:42 -0800 | [diff] [blame] | 99 | dummy += re.NamedCapturingGroups().size(); |
| 100 | dummy += re.CapturingGroupNames().size(); |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 101 | dummy += RE2::QuoteMeta(pattern).size(); |
| 102 | } |
| 103 | |
| 104 | // Entry point for libFuzzer. |
| 105 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { |
Paul Wankadia | 4c916c9 | 2018-09-03 03:48:25 -0700 | [diff] [blame] | 106 | if (size == 0 || size > 999) |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 107 | return 0; |
| 108 | |
Paul Wankadia | a817612 | 2019-01-25 01:50:33 -0800 | [diff] [blame^] | 109 | // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. |
| 110 | // Otherwise, we will waste time on inputs that have long runs of various |
Paul Wankadia | 2c220e7 | 2017-10-28 13:15:57 +1100 | [diff] [blame] | 111 | // character classes. The fuzzer has shown itself to be easily capable of |
| 112 | // generating such patterns that fall within the other limits, but result |
| 113 | // in timeouts nonetheless. The marginal cost is high - even more so when |
| 114 | // counted repetition is involved - whereas the marginal benefit is zero. |
Paul Wankadia | a817612 | 2019-01-25 01:50:33 -0800 | [diff] [blame^] | 115 | // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. |
| 116 | int cc = 0; |
Paul Wankadia | 15af9e4 | 2017-10-27 16:06:25 +1100 | [diff] [blame] | 117 | for (size_t i = 0; i < size; i++) { |
Paul Wankadia | bfe2920 | 2018-09-19 01:00:45 -0700 | [diff] [blame] | 118 | if (data[i] == '.') |
Paul Wankadia | a817612 | 2019-01-25 01:50:33 -0800 | [diff] [blame^] | 119 | cc++; |
Paul Wankadia | 4c916c9 | 2018-09-03 03:48:25 -0700 | [diff] [blame] | 120 | if (data[i] != '\\') |
| 121 | continue; |
| 122 | i++; |
| 123 | if (i >= size) |
| 124 | break; |
Paul Wankadia | a817612 | 2019-01-25 01:50:33 -0800 | [diff] [blame^] | 125 | if (data[i] == 'p' || data[i] == 'P' || |
| 126 | data[i] == 'd' || data[i] == 'D' || |
| 127 | data[i] == 's' || data[i] == 'S' || |
| 128 | data[i] == 'w' || data[i] == 'W') |
| 129 | cc++; |
Paul Wankadia | 15af9e4 | 2017-10-27 16:06:25 +1100 | [diff] [blame] | 130 | } |
Paul Wankadia | a817612 | 2019-01-25 01:50:33 -0800 | [diff] [blame^] | 131 | if (cc > 9) |
Paul Wankadia | 15af9e4 | 2017-10-27 16:06:25 +1100 | [diff] [blame] | 132 | return 0; |
| 133 | |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 134 | // The one-at-a-time hash by Bob Jenkins. |
| 135 | uint32_t hash = 0; |
| 136 | for (size_t i = 0; i < size; i++) { |
| 137 | hash += data[i]; |
| 138 | hash += (hash << 10); |
| 139 | hash ^= (hash >> 6); |
| 140 | } |
| 141 | hash += (hash << 3); |
| 142 | hash ^= (hash >> 11); |
| 143 | hash += (hash << 15); |
| 144 | |
| 145 | RE2::Options options; |
| 146 | options.set_log_errors(false); |
Paul Wankadia | 4be7a14 | 2017-10-13 16:29:01 +1100 | [diff] [blame] | 147 | options.set_max_mem(64 << 20); |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 148 | options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1 |
| 149 | : RE2::Options::EncodingUTF8); |
| 150 | options.set_posix_syntax(hash & 2); |
| 151 | options.set_longest_match(hash & 4); |
| 152 | options.set_literal(hash & 8); |
| 153 | options.set_never_nl(hash & 16); |
| 154 | options.set_dot_nl(hash & 32); |
| 155 | options.set_never_capture(hash & 64); |
| 156 | options.set_case_sensitive(hash & 128); |
| 157 | options.set_perl_classes(hash & 256); |
| 158 | options.set_word_boundary(hash & 512); |
| 159 | options.set_one_line(hash & 1024); |
| 160 | |
| 161 | const char* ptr = reinterpret_cast<const char*>(data); |
| 162 | int len = static_cast<int>(size); |
| 163 | |
| 164 | StringPiece pattern(ptr, len); |
| 165 | StringPiece text(ptr, len); |
| 166 | Test(pattern, options, text); |
| 167 | |
Paul Wankadia | bfa5864 | 2016-05-24 18:36:12 +1000 | [diff] [blame] | 168 | return 0; |
| 169 | } |