blob: 83971a1bbc0da8166028ab2b29425891ef64213c [file] [log] [blame]
Paul Wankadiabfa58642016-05-24 18:36:12 +10001// Copyright 2016 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include <stddef.h>
6#include <stdint.h>
Paul Wankadia1d0aa852016-05-31 21:54:34 +10007#include <map>
Paul Wankadia749d64c2018-11-05 21:13:23 -08008#include <memory>
9#include <queue>
Paul Wankadiabfa58642016-05-24 18:36:12 +100010#include <string>
11
Paul Wankadia22caec62018-10-29 22:58:45 -070012#include "re2/prefilter.h"
Paul Wankadiabfa58642016-05-24 18:36:12 +100013#include "re2/re2.h"
Paul Wankadiabfa58642016-05-24 18:36:12 +100014
Paul Wankadiabfa58642016-05-24 18:36:12 +100015using re2::StringPiece;
16using std::string;
17
18// NOT static, NOT signed.
19uint8_t dummy = 0;
20
21void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
22 RE2 re(pattern, options);
23 if (!re.ok())
24 return;
25
Paul Wankadiaeb438002017-02-17 15:54:27 +110026 // Don't waste time fuzzing high-size programs.
Paul Wankadia22caec62018-10-29 22:58:45 -070027 // They can cause bug reports due to fuzzer timeouts.
Paul Wankadiaeb438002017-02-17 15:54:27 +110028 int size = re.ProgramSize();
Paul Wankadia54ca2cd2018-10-02 19:42:12 -070029 if (size > 9999)
30 return;
Paul Wankadiaf94a5b72018-10-02 01:19:30 -070031 int rsize = re.ReverseProgramSize();
Paul Wankadia54ca2cd2018-10-02 19:42:12 -070032 if (rsize > 9999)
Paul Wankadiaeb438002017-02-17 15:54:27 +110033 return;
34
Paul Wankadia1d0aa852016-05-31 21:54:34 +100035 // Don't waste time fuzzing high-fanout programs.
Paul Wankadia22caec62018-10-29 22:58:45 -070036 // They can cause bug reports due to fuzzer timeouts.
Paul Wankadiaee55a8f2016-08-02 21:49:57 +100037 std::map<int, int> histogram;
Paul Wankadia1d0aa852016-05-31 21:54:34 +100038 int fanout = re.ProgramFanout(&histogram);
Paul Wankadia22caec62018-10-29 22:58:45 -070039 if (fanout > 9)
Paul Wankadia54ca2cd2018-10-02 19:42:12 -070040 return;
Paul Wankadiaf94a5b72018-10-02 01:19:30 -070041 int rfanout = re.ReverseProgramFanout(&histogram);
Paul Wankadia22caec62018-10-29 22:58:45 -070042 if (rfanout > 9)
43 return;
44
45 // Don't waste time fuzzing programs with large substrings.
46 // They can cause bug reports due to fuzzer timeouts when they
47 // are repetitions (e.g. hundreds of NUL bytes) and matching is
48 // unanchored. And they aren't interesting for fuzzing purposes.
Paul Wankadia749d64c2018-11-05 21:13:23 -080049 std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
50 if (prefilter == nullptr)
Paul Wankadia22caec62018-10-29 22:58:45 -070051 return;
Paul Wankadia749d64c2018-11-05 21:13:23 -080052 std::queue<re2::Prefilter*> nodes;
53 nodes.push(prefilter.get());
54 while (!nodes.empty()) {
55 re2::Prefilter* node = nodes.front();
56 nodes.pop();
57 if (node->op() == re2::Prefilter::ATOM) {
58 if (node->atom().size() > 9)
59 return;
60 } else if (node->op() == re2::Prefilter::AND ||
61 node->op() == re2::Prefilter::OR) {
62 for (re2::Prefilter* sub : *node->subs())
63 nodes.push(sub);
64 }
Paul Wankadia169debd2018-11-05 20:47:31 -080065 }
Paul Wankadia1d0aa852016-05-31 21:54:34 +100066
Paul Wankadiaf620af72018-12-09 22:42:42 -080067 if (re.NumberOfCapturingGroups() == 0) {
68 // Avoid early return due to too many arguments.
69 StringPiece sp = text;
70 RE2::FullMatch(sp, re);
71 RE2::PartialMatch(sp, re);
72 RE2::Consume(&sp, re);
73 sp = text; // Reset.
74 RE2::FindAndConsume(&sp, re);
75 } else {
76 // Okay, we have at least one capturing group...
77 // Try conversion for variously typed arguments.
78 StringPiece sp = text;
79 short s;
80 RE2::FullMatch(sp, re, &s);
81 long l;
82 RE2::PartialMatch(sp, re, &l);
83 float f;
84 RE2::Consume(&sp, re, &f);
85 sp = text; // Reset.
86 double d;
87 RE2::FindAndConsume(&sp, re, &d);
88 }
Paul Wankadiabfa58642016-05-24 18:36:12 +100089
Paul Wankadiaf620af72018-12-09 22:42:42 -080090 string s = string(text);
91 RE2::Replace(&s, re, "");
92 s = string(text); // Reset.
93 RE2::GlobalReplace(&s, re, "");
Paul Wankadiabfa58642016-05-24 18:36:12 +100094
Paul Wankadiaf620af72018-12-09 22:42:42 -080095 string min, max;
96 re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
Paul Wankadiabfa58642016-05-24 18:36:12 +100097
98 // Exercise some other API functionality.
Paul Wankadiaf620af72018-12-09 22:42:42 -080099 dummy += re.NamedCapturingGroups().size();
100 dummy += re.CapturingGroupNames().size();
Paul Wankadiabfa58642016-05-24 18:36:12 +1000101 dummy += RE2::QuoteMeta(pattern).size();
102}
103
104// Entry point for libFuzzer.
105extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
Paul Wankadia4c916c92018-09-03 03:48:25 -0700106 if (size == 0 || size > 999)
Paul Wankadiabfa58642016-05-24 18:36:12 +1000107 return 0;
108
Paul Wankadiaa8176122019-01-25 01:50:33 -0800109 // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
110 // Otherwise, we will waste time on inputs that have long runs of various
Paul Wankadia2c220e72017-10-28 13:15:57 +1100111 // character classes. The fuzzer has shown itself to be easily capable of
112 // generating such patterns that fall within the other limits, but result
113 // in timeouts nonetheless. The marginal cost is high - even more so when
114 // counted repetition is involved - whereas the marginal benefit is zero.
Paul Wankadiaa8176122019-01-25 01:50:33 -0800115 // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
116 int cc = 0;
Paul Wankadia15af9e42017-10-27 16:06:25 +1100117 for (size_t i = 0; i < size; i++) {
Paul Wankadiabfe29202018-09-19 01:00:45 -0700118 if (data[i] == '.')
Paul Wankadiaa8176122019-01-25 01:50:33 -0800119 cc++;
Paul Wankadia4c916c92018-09-03 03:48:25 -0700120 if (data[i] != '\\')
121 continue;
122 i++;
123 if (i >= size)
124 break;
Paul Wankadiaa8176122019-01-25 01:50:33 -0800125 if (data[i] == 'p' || data[i] == 'P' ||
126 data[i] == 'd' || data[i] == 'D' ||
127 data[i] == 's' || data[i] == 'S' ||
128 data[i] == 'w' || data[i] == 'W')
129 cc++;
Paul Wankadia15af9e42017-10-27 16:06:25 +1100130 }
Paul Wankadiaa8176122019-01-25 01:50:33 -0800131 if (cc > 9)
Paul Wankadia15af9e42017-10-27 16:06:25 +1100132 return 0;
133
Paul Wankadiabfa58642016-05-24 18:36:12 +1000134 // The one-at-a-time hash by Bob Jenkins.
135 uint32_t hash = 0;
136 for (size_t i = 0; i < size; i++) {
137 hash += data[i];
138 hash += (hash << 10);
139 hash ^= (hash >> 6);
140 }
141 hash += (hash << 3);
142 hash ^= (hash >> 11);
143 hash += (hash << 15);
144
145 RE2::Options options;
146 options.set_log_errors(false);
Paul Wankadia4be7a142017-10-13 16:29:01 +1100147 options.set_max_mem(64 << 20);
Paul Wankadiabfa58642016-05-24 18:36:12 +1000148 options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
149 : RE2::Options::EncodingUTF8);
150 options.set_posix_syntax(hash & 2);
151 options.set_longest_match(hash & 4);
152 options.set_literal(hash & 8);
153 options.set_never_nl(hash & 16);
154 options.set_dot_nl(hash & 32);
155 options.set_never_capture(hash & 64);
156 options.set_case_sensitive(hash & 128);
157 options.set_perl_classes(hash & 256);
158 options.set_word_boundary(hash & 512);
159 options.set_one_line(hash & 1024);
160
161 const char* ptr = reinterpret_cast<const char*>(data);
162 int len = static_cast<int>(size);
163
164 StringPiece pattern(ptr, len);
165 StringPiece text(ptr, len);
166 Test(pattern, options, text);
167
Paul Wankadiabfa58642016-05-24 18:36:12 +1000168 return 0;
169}