initial release
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
new file mode 100644
index 0000000..b9dcbd2
--- /dev/null
+++ b/re2/testing/backtrack.cc
@@ -0,0 +1,253 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+//
+// Prog::BadSearchBacktrack is a backtracking regular expression search,
+// except that it remembers where it has been, trading a lot of
+// memory for a lot of time. It exists only for testing purposes.
+//
+// Let me repeat that.
+//
+// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
+// - It uses a ton of memory.
+// - It uses a ton of stack.
+// - It uses CHECK and LOG(FATAL).
+// - It implements unanchored search by repeated anchored search.
+//
+// On the other hand, it is very simple and a good reference
+// implementation for the more complicated regexp packages.
+//
+// In BUILD, this file is linked into the ":testing" library,
+// not the main library, in order to make it harder to pick up
+// accidentally.
+
+#include "util/util.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Backtracker holds the state for a backtracking search.
+//
+// Excluding the search parameters, the main search state
+// is just the "capture registers", which record, for the
+// current execution, the string position at which each
+// parenthesis was passed. cap_[0] and cap_[1] are the
+// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
+//
+// To avoid infinite loops during backtracking on expressions
+// like (a*)*, the visited_[] bitmap marks the (state, string-position)
+// pairs that have already been explored and are thus not worth
+// re-exploring if we get there via another path. Modern backtracking
+// libraries engineer their program representation differently, to make
+// such infinite loops possible to avoid without keeping a giant visited_
+// bitmap, but visited_ works fine for a reference implementation
+// and it has the nice benefit of making the search run in linear time.
+class Backtracker {
+ public:
+ explicit Backtracker(Prog* prog);
+ ~Backtracker();
+
+ bool Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch);
+
+ private:
+ // Explores from instruction ip at string position p looking for a match.
+ // Returns true if found (so that caller can stop trying other possibilities).
+ bool Visit(Inst* ip, const char* p);
+
+ // Search parameters
+ Prog* prog_; // program being run
+ StringPiece text_; // text being searched
+ StringPiece context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether search must end at text.end()
+ StringPiece *submatch_; // submatches to fill in
+ int nsubmatch_; // # of submatches to fill in
+
+ // Search state
+ const char* cap_[64]; // capture registers
+ uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
+ int nvisited_; // # of words in bitmap
+};
+
+Backtracker::Backtracker(Prog* prog)
+ : prog_(prog),
+ anchored_(false),
+ longest_(false),
+ submatch_(NULL),
+ nsubmatch_(0),
+ visited_(NULL),
+ nvisited_(0) {
+}
+
+Backtracker::~Backtracker() {
+ delete[] visited_;
+}
+
+// Runs a backtracking search.
+bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch) {
+ text_ = text;
+ context_ = context;
+ if (context_.begin() == NULL)
+ context_ = text;
+ if (prog_->anchor_start() && text.begin() > context_.begin())
+ return false;
+ if (prog_->anchor_end() && text.end() < context_.end())
+ return false;
+ anchored_ = anchored | prog_->anchor_start();
+ longest_ = longest | prog_->anchor_end();
+ endmatch_ = prog_->anchor_end();
+ submatch_ = submatch;
+ nsubmatch_ = nsubmatch;
+ CHECK(2*nsubmatch_ < arraysize(cap_));
+ memset(cap_, 0, sizeof cap_);
+
+ // We use submatch_[0] for our own bookkeeping,
+ // so it had better exist.
+ StringPiece sp0;
+ if (nsubmatch < 1) {
+ submatch_ = &sp0;
+ nsubmatch_ = 1;
+ }
+ submatch_[0] = NULL;
+
+ // Allocate new visited_ bitmap -- size is proportional
+ // to text, so have to reallocate on each call to Search.
+ if (visited_)
+ delete[] visited_;
+ nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
+ visited_ = new uint32[nvisited_];
+ memset(visited_, 0, nvisited_*sizeof visited_[0]);
+
+ // Anchored search must start at text.begin().
+ if (anchored_) {
+ cap_[0] = text.begin();
+ return Visit(prog_->start(), text.begin());
+ }
+
+ // Unanchored search, starting from each possible text position.
+ // Notice that we have to try the empty string at the end of
+ // the text, so the loop condition is p <= text.end(), not p < text.end().
+ for (const char* p = text.begin(); p <= text.end(); p++) {
+ cap_[0] = p;
+ if (Visit(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
+ }
+ return false;
+}
+
+// Explores from instruction ip at string position p looking for a match.
+// Return true if found (so that caller can stop trying other possibilities).
+bool Backtracker::Visit(Inst* ip, const char* p) {
+ // Check bitmap. If we've already explored from here,
+ // either it didn't match or it did but we're hoping for a better match.
+ // Either way, don't go down that road again.
+ CHECK(p <= text_.end());
+ int n = ip->id()*(text_.size()+1) + (p - text_.begin());
+ CHECK_LT(n/32, nvisited_);
+ if (visited_[n/32] & (1 << (n&31)))
+ return false;
+ visited_[n/32] |= 1 << (n&31);
+
+ // Pick out byte at current position. If at end of string,
+ // have to explore in hope of finishing a match. Use impossible byte -1.
+ int c = -1;
+ if (p < text_.end())
+ c = *p & 0xFF;
+
+ switch (ip->opcode()) {
+ default:
+ LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
+ return false; // not reached
+
+ case kInstAlt:
+ case kInstAltMatch:
+ // Try both possible next states: out is preferred to out1.
+ if (Visit(ip->out(), p)) {
+ if (longest_)
+ Visit(ip->out1(), p);
+ return true;
+ }
+ return Visit(ip->out1(), p);
+
+ case kInstByteRange:
+ if (ip->Matches(c))
+ return Visit(ip->out(), p+1);
+ return false;
+
+ case kInstCapture:
+ if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
+ // Capture p to register, but save old value.
+ const char* q = cap_[ip->cap()];
+ cap_[ip->cap()] = p;
+ bool ret = Visit(ip->out(), p);
+ // Restore old value as we backtrack.
+ cap_[ip->cap()] = q;
+ return ret;
+ }
+ return Visit(ip->out(), p);
+
+ case kInstEmptyWidth:
+ if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+ return false;
+ return Visit(ip->out(), p);
+
+ case kInstNop:
+ return Visit(ip->out(), p);
+
+ case kInstMatch:
+ // We found a match. If it's the best so far, record the
+ // parameters in the caller's submatch_ array.
+ if (endmatch_ && p != context_.end())
+ return false;
+ cap_[1] = p;
+ if (submatch_[0].data() == NULL || // First match so far ...
+ (longest_ && p > submatch_[0].end())) { // ... or better match
+ for (int i = 0; i < nsubmatch_; i++)
+ submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
+ }
+ return true;
+
+ case kInstFail:
+ return false;
+ }
+}
+
+// Runs a backtracking search.
+bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
+ const StringPiece& context,
+ Anchor anchor,
+ MatchKind kind,
+ StringPiece* match,
+ int nmatch) {
+ // If full match, we ask for an anchored longest match
+ // and then check that match[0] == text.
+ // So make sure match[0] exists.
+ StringPiece sp0;
+ if (kind == kFullMatch) {
+ anchor = kAnchored;
+ if (nmatch < 1) {
+ match = &sp0;
+ nmatch = 1;
+ }
+ }
+
+ // Run the search.
+ Backtracker b(this);
+ bool anchored = anchor == kAnchored;
+ bool longest = kind != kFirstMatch;
+ if (!b.Search(text, context, anchored, longest, match, nmatch))
+ return false;
+ if (kind == kFullMatch && match[0].end() != text.end())
+ return false;
+ return true;
+}
+
+} // namespace re2
diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc
new file mode 100644
index 0000000..217cef8
--- /dev/null
+++ b/re2/testing/charclass_test.cc
@@ -0,0 +1,188 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test character class manipulations.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct CCTest {
+ struct {
+ Rune lo;
+ Rune hi;
+ } add[10];
+ int remove;
+ struct {
+ Rune lo;
+ Rune hi;
+ } final[10];
+};
+
+static CCTest tests[] = {
+ { { { 10, 20 }, {-1} }, -1,
+ { { 10, 20 }, {-1} } },
+
+ { { { 10, 20 }, { 20, 30 }, {-1} }, -1,
+ { { 10, 30 }, {-1} } },
+
+ { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
+ { { 10, 40 }, {-1} } },
+
+ { { { 0, 50 }, { 20, 30 }, {-1} }, -1,
+ { { 0, 50 }, {-1} } },
+
+ { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
+ { { 5, 25 }, {-1} } },
+
+ { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
+ { { 10, 23 }, {-1} } },
+
+ // These check boundary cases during negation.
+ { { { 0, Runemax }, {-1} }, -1,
+ { { 0, Runemax }, {-1} } },
+
+ { { { 0, 50 }, {-1} }, -1,
+ { { 0, 50 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, -1,
+ { { 50, Runemax }, {-1} } },
+
+ // Check RemoveAbove.
+ { { { 50, Runemax }, {-1} }, 255,
+ { { 50, 255 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, 65535,
+ { { 50, 65535 }, {-1} } },
+
+ { { { 50, Runemax }, {-1} }, Runemax,
+ { { 50, Runemax }, {-1} } },
+
+ { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
+ { { 50, 60 }, { 250, 255 }, {-1} } },
+
+ { { { 50, 60 }, {-1} }, 255,
+ { { 50, 60 }, {-1} } },
+
+ { { { 350, 360 }, {-1} }, 255,
+ { {-1} } },
+
+ { { {-1} }, 255,
+ { {-1} } },
+};
+
+static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
+ printf("\n");
+ printf("CharClass added: [%s]", desc);
+ for (int k = 0; t->add[k].lo >= 0; k++)
+ printf(" %d-%d", t->add[k].lo, t->add[k].hi);
+ printf("\n");
+ if (t->remove >= 0)
+ printf("Removed > %d\n", t->remove);
+ printf("\twant:");
+ for (int k = 0; t->final[k].lo >= 0; k++)
+ printf(" %d-%d", t->final[k].lo, t->final[k].hi);
+ printf("\n");
+
+ printf("\thave:");
+ for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+ printf(" %d-%d", it->lo, it->hi);
+ printf("\n");
+}
+
+bool ShouldContain(CCTest *t, int x) {
+ for (int j = 0; t->final[j].lo >= 0; j++)
+ if (t->final[j].lo <= x && x <= t->final[j].hi)
+ return true;
+ return false;
+}
+
+bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
+ CharClass::iterator it = cc->begin();
+ int size = 0;
+ for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
+ if (it == cc->end() ||
+ it->lo != t->final[j].lo ||
+ it->hi != t->final[j].hi) {
+ Broke(desc, t, cc);
+ return false;
+ }
+ size += it->hi - it->lo + 1;
+ }
+ if (it != cc->end()) {
+ Broke(desc, t, cc);
+ return false;
+ }
+ if (cc->size() != size) {
+ Broke(desc, t, cc);
+ printf("wrong size: want %d have %d\n", size, cc->size());
+ return false;
+ }
+
+ for (int j = 0; j < 101; j++) {
+ if (j == 100)
+ j = Runemax;
+ if (ShouldContain(t, j) != cc->Contains(j)) {
+ Broke(desc, t, cc);
+ printf("want contains(%d)=%d, got %d\n",
+ j, ShouldContain(t, j), cc->Contains(j));
+ return false;
+ }
+ }
+
+ CharClass* ncc = cc->Copy();
+ ncc->Negate();
+ for (int j = 0; j < 101; j++) {
+ if (j == 100)
+ j = Runemax;
+ if (ShouldContain(t, j) == ncc->Contains(j)) {
+ Broke(desc, t, cc);
+ printf("want ncc contains(%d)!=%d, got %d\n",
+ j, ShouldContain(t, j), ncc->Contains(j));
+ delete ncc;
+ return false;
+ }
+ if (ncc->size() != Runemax+1 - cc->size()) {
+ Broke(desc, t, cc);
+ printf("ncc size should be %d is %d\n",
+ Runemax+1 - cc->size(), ncc->size());
+ delete ncc;
+ return false;
+ }
+ }
+ delete ncc;
+ return true;
+}
+
+TEST(TestCharClass, Adds) {
+ int nfail = 0;
+ for (int i = 0; i < arraysize(tests); i++) {
+ CharClass cc;
+ CCTest* t = &tests[i];
+ for (int j = 0; t->add[j].lo >= 0; j++)
+ cc.AddRange(t->add[j].lo, t->add[j].hi);
+ if (t->remove >= 0)
+ cc.RemoveAbove(t->remove);
+ if (!CorrectCC(&cc, t, "before copy"))
+ nfail++;
+
+ CharClass *cc1 = cc.Copy();
+ if (!CorrectCC(cc1, t, "after copy"))
+ nfail++;
+ delete cc1;
+ }
+ EXPECT_EQ(nfail, 0);
+}
+
+} // namespace re2
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
new file mode 100644
index 0000000..9614983
--- /dev/null
+++ b/re2/testing/compile_test.cc
@@ -0,0 +1,171 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test prog.cc, compile.cc
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+#include "re2/prog.h"
+
+DEFINE_string(show, "", "regular expression to compile and dump");
+
+namespace re2 {
+
+// Simple input/output tests checking that
+// the regexp compiles to the expected code.
+// These are just to sanity check the basic implementation.
+// The real confidence tests happen by testing the NFA/DFA
+// that run the compiled code.
+
+struct Test {
+ const char* regexp;
+ const char* code;
+};
+
+static Test tests[] = {
+ { "a",
+ "1. byte [61-61] -> 2\n"
+ "2. match!\n" },
+ { "ab",
+ "1. byte [61-61] -> 2\n"
+ "2. byte [62-62] -> 3\n"
+ "3. match!\n" },
+ { "a|c",
+ "3. alt -> 1 | 2\n"
+ "1. byte [61-61] -> 4\n"
+ "2. byte [63-63] -> 4\n"
+ "4. match!\n" },
+ { "a|b",
+ "1. byte [61-62] -> 2\n"
+ "2. match!\n" },
+ { "[ab]",
+ "1. byte [61-62] -> 2\n"
+ "2. match!\n" },
+ { "a+",
+ "1. byte [61-61] -> 2\n"
+ "2. alt -> 1 | 3\n"
+ "3. match!\n" },
+ { "a+?",
+ "1. byte [61-61] -> 2\n"
+ "2. alt -> 3 | 1\n"
+ "3. match!\n" },
+ { "a*",
+ "2. alt -> 1 | 3\n"
+ "1. byte [61-61] -> 2\n"
+ "3. match!\n" },
+ { "a*?",
+ "2. alt -> 3 | 1\n"
+ "3. match!\n"
+ "1. byte [61-61] -> 2\n" },
+ { "a?",
+ "2. alt -> 1 | 3\n"
+ "1. byte [61-61] -> 3\n"
+ "3. match!\n" },
+ { "a??",
+ "2. alt -> 3 | 1\n"
+ "3. match!\n"
+ "1. byte [61-61] -> 3\n" },
+ { "a{4}",
+ "1. byte [61-61] -> 2\n"
+ "2. byte [61-61] -> 3\n"
+ "3. byte [61-61] -> 4\n"
+ "4. byte [61-61] -> 5\n"
+ "5. match!\n" },
+ { "(a)",
+ "2. capture 2 -> 1\n"
+ "1. byte [61-61] -> 3\n"
+ "3. capture 3 -> 4\n"
+ "4. match!\n" },
+ { "(?:a)",
+ "1. byte [61-61] -> 2\n"
+ "2. match!\n" },
+ { "",
+ "2. match!\n" },
+ { ".",
+ "3. alt -> 1 | 2\n"
+ "1. byte [00-09] -> 4\n"
+ "2. byte [0b-ff] -> 4\n"
+ "4. match!\n" },
+ { "[^ab]",
+ "5. alt -> 3 | 4\n"
+ "3. alt -> 1 | 2\n"
+ "4. byte [63-ff] -> 6\n"
+ "1. byte [00-09] -> 6\n"
+ "2. byte [0b-60] -> 6\n"
+ "6. match!\n" },
+ { "[Aa]",
+ "1. byte/i [61-61] -> 2\n"
+ "2. match!\n" },
+};
+
+TEST(TestRegexpCompileToProg, Simple) {
+ int failed = 0;
+ for (int i = 0; i < arraysize(tests); i++) {
+ const re2::Test& t = tests[i];
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
+ if (re == NULL) {
+ LOG(ERROR) << "Cannot parse: " << t.regexp;
+ failed++;
+ continue;
+ }
+ Prog* prog = re->CompileToProg(0);
+ if (prog == NULL) {
+ LOG(ERROR) << "Cannot compile: " << t.regexp;
+ re->Decref();
+ failed++;
+ continue;
+ }
+ CHECK(re->CompileToProg(1) == NULL);
+ string s = prog->Dump();
+ if (s != t.code) {
+ LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
+ LOG(ERROR) << "Want:\n" << t.code;
+ LOG(ERROR) << "Got:\n" << s;
+ failed++;
+ }
+ delete prog;
+ re->Decref();
+ }
+ EXPECT_EQ(failed, 0);
+}
+
+// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+// Once, erroneously split between 0x3f and 0x40 because it is
+// a 6-bit boundary.
+static struct UTF8ByteRange {
+ int lo;
+ int hi;
+} utf8ranges[] = {
+ { 0x00, 0x09 },
+ { 0x0A, 0x0A },
+ { 0x10, 0x7F },
+ { 0x80, 0x8F },
+ { 0x90, 0x9F },
+ { 0xA0, 0xBF },
+ { 0xC0, 0xC1 },
+ { 0xC2, 0xDF },
+ { 0xE0, 0xE0 },
+ { 0xE1, 0xEF },
+ { 0xF0, 0xF0 },
+ { 0xF1, 0xF3 },
+ { 0xF4, 0xF4 },
+ { 0xF5, 0xFF },
+};
+
+TEST(TestCompile, ByteRanges) {
+ Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
+ EXPECT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
+ for (int i = 0; i < arraysize(utf8ranges); i++)
+ for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
+ EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
+ delete prog;
+ re->Decref();
+}
+
+} // namespace re2
diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc
new file mode 100644
index 0000000..ab1261e
--- /dev/null
+++ b/re2/testing/dfa_test.cc
@@ -0,0 +1,347 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "util/thread.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+DECLARE_bool(re2_dfa_bail_when_slow);
+
+// These parameters take 3s (opt) to run to completion
+// on a 3.6GHz Xeon. They repeatedly make the old unlocked
+// grep crash. In fact the old DFA crashes with just
+// 1 or 2 repetitions, so 10 is plenty of a safety margin.
+// Have to drop the dbg settings because dbg is very slow.
+DEFINE_int32(size, 8, "log2(number of DFA nodes)");
+DEFINE_int32(repeat, DEBUG_MODE ? 2 : 10, "Repetition count.");
+DEFINE_int32(threads, DEBUG_MODE ? 2 : 8, "number of threads");
+
+namespace re2 {
+
+// Check that multithreaded access to DFA class works.
+
+// Helper thread: builds entire DFA for prog.
+class BuildThread : public Thread {
+ public:
+ BuildThread(Prog* prog) : prog_(prog) {}
+ virtual void Run() {
+ CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
+ }
+
+ private:
+ Prog* prog_;
+};
+
+TEST(Multithreaded, BuildEntireDFA) {
+ // Create regexp with 2^FLAGS_size states in DFA.
+ string s = "a";
+ for (int i = 0; i < FLAGS_size; i++)
+ s += "[ab]";
+ s += "b";
+
+ // Check that single-threaded code works.
+ {
+ //LOG(INFO) << s;
+ Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ BuildThread* t = new BuildThread(prog);
+ t->SetJoinable(true);
+ t->Start();
+ t->Join();
+ delete t;
+ delete prog;
+ re->Decref();
+ }
+
+ // Build the DFA simultaneously in a bunch of threads.
+ for (int i = 0; i < FLAGS_repeat; i++) {
+ Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+
+ vector<BuildThread*> threads;
+ for (int j = 0; j < FLAGS_threads; j++) {
+ BuildThread *t = new BuildThread(prog);
+ t->SetJoinable(true);
+ threads.push_back(t);
+ }
+ for (int j = 0; j < FLAGS_threads; j++)
+ threads[j]->Start();
+ for (int j = 0; j < FLAGS_threads; j++) {
+ threads[j]->Join();
+ delete threads[j];
+ }
+
+ // One more compile, to make sure everything is okay.
+ prog->BuildEntireDFA(Prog::kFirstMatch);
+ delete prog;
+ re->Decref();
+ }
+}
+
+// Check that DFA size requirements are followed.
+// BuildEntireDFA will, like SearchDFA, stop building out
+// the DFA once the memory limits are reached.
+TEST(SingleThreaded, BuildEntireDFA) {
+ // Create regexp with 2^30 states in DFA.
+ string s = "a";
+ for (int i = 0; i < 30; i++)
+ s += "[ab]";
+ s += "b";
+
+ //LOG(INFO) << s;
+ Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
+ CHECK(re);
+ for (int i = 17; i < 28; i++) {
+ int limit = 1<<i;
+ int usage, progusage, dfamem;
+ {
+ testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+ Prog* prog = re->CompileToProg(limit);
+ CHECK(prog);
+ progusage = m.HeapGrowth();
+ dfamem = prog->dfa_mem();
+ prog->BuildEntireDFA(Prog::kFirstMatch);
+ prog->BuildEntireDFA(Prog::kLongestMatch);
+ usage = m.HeapGrowth();
+ delete prog;
+ }
+ if (!UsingMallocCounter)
+ continue;
+ //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
+ // limit, progusage, dfamem, usage);
+ CHECK_GT(usage, limit*9/10);
+ CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay
+ }
+ re->Decref();
+}
+
+// Generates and returns a string over binary alphabet {0,1} that contains
+// all possible binary sequences of length n as subsequences. The obvious
+// brute force method would generate a string of length n * 2^n, but this
+// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
+// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
+// Such a string is useful for testing a DFA. If you have a DFA
+// where distinct last n bytes implies distinct states, then running on a
+// DeBruijn string causes the DFA to need to create a new state at every
+// position in the input, never reusing any states until it gets to the
+// end of the string. This is the worst possible case for DFA execution.
+static string DeBruijnString(int n) {
+ CHECK_LT(n, 8*sizeof(int));
+ CHECK_GT(n, 0);
+
+ vector<bool> did(1<<n);
+ for (int i = 0; i < 1<<n; i++)
+ did[i] = false;
+
+ string s;
+ for (int i = 0; i < n-1; i++)
+ s.append("0");
+ int bits = 0;
+ int mask = (1<<n) - 1;
+ for (int i = 0; i < (1<<n); i++) {
+ bits <<= 1;
+ bits &= mask;
+ if (!did[bits|1]) {
+ bits |= 1;
+ s.append("1");
+ } else {
+ s.append("0");
+ }
+ CHECK(!did[bits]);
+ did[bits] = true;
+ }
+ return s;
+}
+
+// Test that the DFA gets the right result even if it runs
+// out of memory during a search. The regular expression
+// 0[01]{n}$ matches a binary string of 0s and 1s only if
+// the (n+1)th-to-last character is a 0. Matching this in
+// a single forward pass (as done by the DFA) requires
+// keeping one bit for each of the last n+1 characters
+// (whether each was a 0), or 2^(n+1) possible states.
+// If we run this regexp to search in a string that contains
+// every possible n-character binary string as a substring,
+// then it will have to run through at least 2^n states.
+// States are big data structures -- certainly more than 1 byte --
+// so if the DFA can search correctly while staying within a
+// 2^n byte limit, it must be handling out-of-memory conditions
+// gracefully.
+TEST(SingleThreaded, SearchDFA) {
+ // Choice of n is mostly arbitrary, except that:
+ // * making n too big makes the test run for too long.
+ // * making n too small makes the DFA refuse to run,
+ // because it has so little memory compared to the program size.
+ // Empirically, n = 18 is a good compromise between the two.
+ const int n = 18;
+
+ Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+ Regexp::LikePerl, NULL);
+ CHECK(re);
+
+ // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
+ // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
+ string no_match = DeBruijnString(n);
+ string match = no_match + "0";
+
+ // The De Bruijn string is the worst case input for this regexp.
+ // By default, the DFA will notice that it is flushing its cache
+ // too frequently and will bail out early, so that RE2 can use the
+ // NFA implementation instead. (The DFA loses its speed advantage
+ // if it can't get a good cache hit rate.)
+ // Tell the DFA to trudge along instead.
+ FLAGS_re2_dfa_bail_when_slow = false;
+
+ int64 usage;
+ int64 peak_usage;
+ {
+ testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
+ Prog* prog = re->CompileToProg(1<<n);
+ CHECK(prog);
+ for (int i = 0; i < 10; i++) {
+ bool matched, failed = false;
+ matched = prog->SearchDFA(match, NULL,
+ Prog::kUnanchored, Prog::kFirstMatch,
+ NULL, &failed);
+ CHECK(!failed);
+ CHECK(matched);
+ matched = prog->SearchDFA(no_match, NULL,
+ Prog::kUnanchored, Prog::kFirstMatch,
+ NULL, &failed);
+ CHECK(!failed);
+ CHECK(!matched);
+ }
+ usage = m.HeapGrowth();
+ peak_usage = m.PeakHeapGrowth();
+ delete prog;
+ }
+ re->Decref();
+
+ if (!UsingMallocCounter)
+ return;
+ //LOG(INFO) << "usage " << usage << " " << peak_usage;
+ CHECK_LT(usage, 1<<n);
+ CHECK_LT(peak_usage, 1<<n);
+}
+
+// Helper thread: searches for match, which should match,
+// and no_match, which should not.
+class SearchThread : public Thread {
+ public:
+ SearchThread(Prog* prog, const StringPiece& match,
+ const StringPiece& no_match)
+ : prog_(prog), match_(match), no_match_(no_match) {}
+
+ virtual void Run() {
+ for (int i = 0; i < 2; i++) {
+ bool matched, failed = false;
+ matched = prog_->SearchDFA(match_, NULL,
+ Prog::kUnanchored, Prog::kFirstMatch,
+ NULL, &failed);
+ CHECK(!failed);
+ CHECK(matched);
+ matched = prog_->SearchDFA(no_match_, NULL,
+ Prog::kUnanchored, Prog::kFirstMatch,
+ NULL, &failed);
+ CHECK(!failed);
+ CHECK(!matched);
+ }
+ }
+
+ private:
+ Prog* prog_;
+ StringPiece match_;
+ StringPiece no_match_;
+};
+
+TEST(Multithreaded, SearchDFA) {
+ // Same as single-threaded test above.
+ const int n = 18;
+ Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+ Regexp::LikePerl, NULL);
+ CHECK(re);
+ string no_match = DeBruijnString(n);
+ string match = no_match + "0";
+ FLAGS_re2_dfa_bail_when_slow = false;
+
+ // Check that single-threaded code works.
+ {
+ Prog* prog = re->CompileToProg(1<<n);
+ CHECK(prog);
+ SearchThread* t = new SearchThread(prog, match, no_match);
+ t->SetJoinable(true);
+ t->Start();
+ t->Join();
+ delete t;
+ delete prog;
+ }
+
+ // Run the search simultaneously in a bunch of threads.
+ // Reuse same flags for Multithreaded.BuildDFA above.
+ for (int i = 0; i < FLAGS_repeat; i++) {
+ //LOG(INFO) << "Search " << i;
+ Prog* prog = re->CompileToProg(1<<n);
+ CHECK(prog);
+
+ vector<SearchThread*> threads;
+ for (int j = 0; j < FLAGS_threads; j++) {
+ SearchThread *t = new SearchThread(prog, match, no_match);
+ t->SetJoinable(true);
+ threads.push_back(t);
+ }
+ for (int j = 0; j < FLAGS_threads; j++)
+ threads[j]->Start();
+ for (int j = 0; j < FLAGS_threads; j++) {
+ threads[j]->Join();
+ delete threads[j];
+ }
+ delete prog;
+ }
+ re->Decref();
+}
+
+struct ReverseTest {
+ const char *regexp;
+ const char *text;
+ bool match;
+};
+
+// Test that reverse DFA handles anchored/unanchored correctly.
+// It's in the DFA interface but not used by RE2.
+ReverseTest reverse_tests[] = {
+ { "\\A(a|b)", "abc", true },
+ { "(a|b)\\z", "cba", true },
+ { "\\A(a|b)", "cba", false },
+ { "(a|b)\\z", "abc", false },
+};
+
+TEST(DFA, ReverseMatch) {
+ int nfail = 0;
+ for (int i = 0; i < arraysize(reverse_tests); i++) {
+ const ReverseTest& t = reverse_tests[i];
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog *prog = re->CompileToReverseProg(0);
+ CHECK(prog);
+ bool failed = false;
+ bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed);
+ if (matched != t.match) {
+ LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
+ nfail++;
+ }
+ delete prog;
+ re->Decref();
+ }
+ EXPECT_EQ(nfail, 0);
+}
+
+} // namespace re2
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
new file mode 100644
index 0000000..27c85f5
--- /dev/null
+++ b/re2/testing/dump.cc
@@ -0,0 +1,164 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Dump the regexp into a string showing structure.
+// Tested by parse_unittest.cc
+
+// This function traverses the regexp recursively,
+// meaning that on inputs like Regexp::Simplify of
+// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
+// it takes time and space exponential in the size of the
+// original regular expression. It can also use stack space
+// linear in the size of the regular expression for inputs
+// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
+// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
+// As a result, Dump is provided only in the testing
+// library (see BUILD).
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/stringpiece.h"
+#include "re2/regexp.h"
+
+// Cause a link error if this file is used outside of testing.
+DECLARE_string(test_tmpdir);
+
+namespace re2 {
+
+static const char* kOpcodeNames[] = {
+ "bad",
+ "no",
+ "emp",
+ "lit",
+ "str",
+ "cat",
+ "alt",
+ "star",
+ "plus",
+ "que",
+ "rep",
+ "cap",
+ "dot",
+ "byte",
+ "bol",
+ "eol",
+ "wb", // kRegexpWordBoundary
+ "nwb", // kRegexpNoWordBoundary
+ "bot",
+ "eot",
+ "cc",
+};
+
+// Create string representation of regexp with explicit structure.
+// Nothing pretty, just for testing.
+static void DumpRegexpAppending(Regexp* re, string* s) {
+ if (re->op() < 0 || re->op() > kMaxRegexpOp) {
+ StringAppendF(s, "op%d", re->op());
+ } else {
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ if (re->parse_flags() & Regexp::NonGreedy)
+ s->append("n");
+ break;
+ }
+ s->append(kOpcodeNames[re->op()]);
+ if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
+ Rune r = re->rune();
+ if ('a' <= r && r <= 'z')
+ s->append("fold");
+ }
+ if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
+ for (int i = 0; i < re->nrunes(); i++) {
+ Rune r = re->runes()[i];
+ if ('a' <= r && r <= 'z') {
+ s->append("fold");
+ break;
+ }
+ }
+ }
+ }
+ s->append("{");
+ switch (re->op()) {
+ default:
+ break;
+ case kRegexpLiteral: {
+ Rune r = re->rune();
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ break;
+ }
+ case kRegexpLiteralString:
+ for (int i = 0; i < re->nrunes(); i++) {
+ Rune r = re->runes()[i];
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
+ break;
+ case kRegexpConcat:
+ case kRegexpAlternate:
+ for (int i = 0; i < re->nsub(); i++)
+ DumpRegexpAppending(re->sub()[i], s);
+ break;
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpCapture:
+ if (re->name()) {
+ s->append(*re->name());
+ s->append(":");
+ }
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpRepeat:
+ s->append(StringPrintf("%d,%d ", re->min(), re->max()));
+ DumpRegexpAppending(re->sub()[0], s);
+ break;
+ case kRegexpAnyChar:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ break;
+ case kRegexpCharClass: {
+ string sep;
+ for (CharClass::iterator it = re->cc()->begin();
+ it != re->cc()->end(); ++it) {
+ RuneRange rr = *it;
+ s->append(sep);
+ if (rr.lo == rr.hi)
+ s->append(StringPrintf("%#x", rr.lo));
+ else
+ s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
+ sep = " ";
+ }
+ break;
+ }
+ }
+ s->append("}");
+}
+
+string Regexp::Dump() {
+ string s;
+
+ // Make sure being called from a unit test.
+ if (FLAGS_test_tmpdir.empty()) {
+ LOG(ERROR) << "Cannot use except for testing.";
+ return s;
+ }
+
+ DumpRegexpAppending(this, &s);
+ return s;
+}
+
+} // namespace re2
diff --git a/re2/testing/exhaustive1_test.cc b/re2/testing/exhaustive1_test.cc
new file mode 100644
index 0000000..9b76b57
--- /dev/null
+++ b/re2/testing/exhaustive1_test.cc
@@ -0,0 +1,42 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+DECLARE_string(regexp_engines);
+
+namespace re2 {
+
+// Test simple repetition operators
+TEST(Repetition, Simple) {
+ vector<string> ops = Split(" ",
+ "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+ "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+ "%s* %s+ %s? %s*? %s+? %s??");
+ ExhaustiveTest(3, 2, Explode("abc."), ops,
+ 6, Explode("ab"), "(?:%s)", "");
+ ExhaustiveTest(3, 2, Explode("abc."), ops,
+ 40, Explode("a"), "(?:%s)", "");
+}
+
+// Test capturing parens -- (a) -- inside repetition operators
+TEST(Repetition, Capturing) {
+ vector<string> ops = Split(" ",
+ "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
+ "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
+ "%s* %s+ %s? %s*? %s+? %s??");
+ ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
+ 7, Explode("ab"), "(?:%s)", "");
+
+ // This would be a great test, but it runs forever when PCRE is enabled.
+ if (strstr("PCRE", FLAGS_regexp_engines.c_str()))
+ ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
+ 100, Explode("a"), "(?:%s)", "");
+}
+
+} // namespace re2
+
diff --git a/re2/testing/exhaustive2_test.cc b/re2/testing/exhaustive2_test.cc
new file mode 100644
index 0000000..c5fec5b
--- /dev/null
+++ b/re2/testing/exhaustive2_test.cc
@@ -0,0 +1,70 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/re2.h"
+#include "re2/testing/exhaustive_tester.h"
+
+DECLARE_string(regexp_engines);
+
+namespace re2 {
+
+// Test empty string matches (aka "(?:)")
+TEST(EmptyString, Exhaustive) {
+ ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
+ RegexpGenerator::EgrepOps(),
+ 5, Split("", "ab"), "", "");
+}
+
+// Test escaped versions of regexp syntax.
+TEST(Punctuation, Literals) {
+ vector<string> alphabet = Explode("()*+?{}[]\\^$.");
+ vector<string> escaped = alphabet;
+ for (int i = 0; i < escaped.size(); i++)
+ escaped[i] = "\\" + escaped[i];
+ ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
+ 2, alphabet, "", "");
+}
+
+// Test ^ $ . \A \z in presence of line endings.
+// Have to wrap the empty-width ones in (?:) so that
+// they can be repeated -- PCRE rejects ^* but allows (?:^)*
+TEST(LineEnds, Exhaustive) {
+ ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
+ RegexpGenerator::EgrepOps(),
+ 4, Explode("ab\n"), "", "");
+}
+
+// Test what does and does not match \n.
+// This would be a good test, except that PCRE seems to have a bug:
+// in single-byte character set mode (the default),
+// [^a] matches \n, but in UTF-8 mode it does not.
+// So when we run the test, the tester complains that
+// we don't agree with PCRE, but it's PCRE that is at fault.
+// For what it's worth, Perl gets this right (matches
+// regardless of whether UTF-8 input is selected):
+//
+// #!/usr/bin/perl
+// use POSIX qw(locale_h);
+// print "matches in latin1\n" if "\n" =~ /[^a]/;
+// setlocale("en_US.utf8");
+// print "matches in utf8\n" if "\n" =~ /[^a]/;
+//
+// The rule chosen for RE2 is that by default, like Perl,
+// dot does not match \n but negated character classes [^a] do.
+// (?s) will allow dot to match \n; there is no way in RE2
+// to stop [^a] from matching \n, though the underlying library
+// provides a mechanism, and RE2 could add new syntax if needed.
+//
+// TEST(Newlines, Exhaustive) {
+// vector<string> empty_vector;
+// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
+// RegexpGenerator::EgrepOps(),
+// 4, Explode("a\n"), "");
+// }
+
+} // namespace re2
+
diff --git a/re2/testing/exhaustive3_test.cc b/re2/testing/exhaustive3_test.cc
new file mode 100644
index 0000000..5613fcb
--- /dev/null
+++ b/re2/testing/exhaustive3_test.cc
@@ -0,0 +1,94 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+// Test simple character classes by themselves.
+TEST(CharacterClasses, Exhaustive) {
+ vector<string> atoms = Split(" ",
+ "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
+ ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
+ 5, Explode("ab"), "", "");
+}
+
+// Test simple character classes inside a___b (for example, a[a]b).
+TEST(CharacterClasses, ExhaustiveAB) {
+ vector<string> atoms = Split(" ",
+ "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
+ ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
+ 5, Explode("ab"), "a%sb", "");
+}
+
+// Returns UTF8 for Rune r
+static string UTF8(Rune r) {
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ return string(buf);
+}
+
+// Returns a vector of "interesting" UTF8 characters.
+// Unicode is now too big to just return all of them,
+// so UTF8Characters return a set likely to be good test cases.
+static const vector<string>& InterestingUTF8() {
+ static bool init;
+ static vector<string> v;
+
+ if (init)
+ return v;
+
+ init = true;
+ // All the Latin1 equivalents are interesting.
+ for (int i = 1; i < 256; i++)
+ v.push_back(UTF8(i));
+
+ // After that, the codes near bit boundaries are
+ // interesting, because they span byte sequence lengths.
+ for (int j = 0; j < 8; j++)
+ v.push_back(UTF8(256 + j));
+ for (int i = 512; i < Runemax; i <<= 1)
+ for (int j = -8; j < 8; j++)
+ v.push_back(UTF8(i + j));
+
+ // The codes near Runemax, including Runemax itself, are interesting.
+ for (int j = -8; j <= 0; j++)
+ v.push_back(UTF8(Runemax + j));
+
+ return v;
+}
+
+// Test interesting UTF-8 characters against character classes.
+TEST(InterestingUTF8, SingleOps) {
+ vector<string> atoms = Split(" ",
+ ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+ "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+ "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+ "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
+ vector<string> ops; // no ops
+ ExhaustiveTest(1, 0, atoms, ops,
+ 1, InterestingUTF8(), "", "");
+}
+
+// Test interesting UTF-8 characters against character classes,
+// but wrap everything inside AB.
+TEST(InterestingUTF8, AB) {
+ vector<string> atoms = Split(" ",
+ ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
+ "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
+ "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
+ "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
+ vector<string> ops; // no ops
+ vector<string> alpha = InterestingUTF8();
+ for (int i = 0; i < alpha.size(); i++)
+ alpha[i] = "a" + alpha[i] + "b";
+ ExhaustiveTest(1, 0, atoms, ops,
+ 1, alpha, "a%sb", "");
+}
+
+} // namespace re2
+
diff --git a/re2/testing/exhaustive_test.cc b/re2/testing/exhaustive_test.cc
new file mode 100644
index 0000000..fc40dee
--- /dev/null
+++ b/re2/testing/exhaustive_test.cc
@@ -0,0 +1,38 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+namespace re2 {
+
+DECLARE_string(regexp_engines);
+
+// Test very simple expressions.
+TEST(EgrepLiterals, Lowercase) {
+ EgrepTest(3, 2, "abc.", 3, "abc", "");
+}
+
+// Test mixed-case expressions.
+TEST(EgrepLiterals, MixedCase) {
+ EgrepTest(3, 2, "AaBb.", 2, "AaBb", "");
+}
+
+// Test mixed-case in case-insensitive mode.
+TEST(EgrepLiterals, FoldCase) {
+ // The punctuation characters surround A-Z and a-z
+ // in the ASCII table. This looks for bugs in the
+ // bytemap range code in the DFA.
+ EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)");
+}
+
+// Test very simple expressions.
+TEST(EgrepLiterals, UTF8) {
+ EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", "");
+}
+
+} // namespace re2
+
diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc
new file mode 100644
index 0000000..c8b9b59
--- /dev/null
+++ b/re2/testing/exhaustive_tester.cc
@@ -0,0 +1,109 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Exhaustive testing of regular expression matching.
+
+// Each test picks an alphabet (e.g., "abc"), a maximum string length,
+// a maximum regular expression length, and a maximum number of letters
+// that can appear in the regular expression. Given these parameters,
+// it tries every possible regular expression and string, verifying that
+// the NFA, DFA, and a trivial backtracking implementation agree about
+// the location of the match.
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+#include "re2/testing/tester.h"
+
+DEFINE_bool(show_regexps, false, "show regexps during testing");
+
+DEFINE_int32(max_bad_regexp_inputs, 1,
+ "Stop testing a regular expression after finding this many "
+ "strings that break it.");
+
+// Compiled in debug mode, the usual tests run for over an hour.
+// Have to cut it down to make the unit test machines happy.
+DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
+
+namespace re2 {
+
+// Processes a single generated regexp.
+// Compiles it using Regexp interface and PCRE, and then
+// checks that NFA, DFA, and PCRE all return the same results.
+void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
+ regexps_++;
+
+ string regexp = const_regexp;
+ if (!topwrapper_.empty())
+ regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
+
+ if (FLAGS_show_regexps) {
+ printf("\r%s", regexp.c_str());
+ fflush(stdout);
+ }
+
+ Tester tester(regexp);
+ if (tester.error())
+ return;
+
+ strgen_.Reset();
+ strgen_.GenerateNULL();
+ if (randomstrings_)
+ strgen_.Random(stringseed_, stringcount_);
+ int bad_inputs = 0;
+ while (strgen_.HasNext()) {
+ tests_++;
+ if (!tester.TestInput(strgen_.Next())) {
+ failures_++;
+ if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
+ break;
+ }
+ }
+}
+
+// Runs an exhaustive test on the given parameters.
+void ExhaustiveTest(int maxatoms, int maxops,
+ const vector<string>& alphabet,
+ const vector<string>& ops,
+ int maxstrlen, const vector<string>& stralphabet,
+ const string& wrapper,
+ const string& topwrapper) {
+ if (DEBUG_MODE && FLAGS_quick_debug_mode) {
+ if (maxatoms > 1)
+ maxatoms--;
+ if (maxops > 1)
+ maxops--;
+ if (maxstrlen > 1)
+ maxstrlen--;
+ }
+ ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
+ maxstrlen, stralphabet, wrapper,
+ topwrapper);
+ t.Generate();
+ printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
+ t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
+ EXPECT_EQ(0, t.failures());
+}
+
+// Runs an exhaustive test using the given parameters and
+// the basic egrep operators.
+void EgrepTest(int maxatoms, int maxops, const string& alphabet,
+ int maxstrlen, const string& stralphabet,
+ const string& wrapper) {
+ const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
+
+ for (int i = 0; i < arraysize(tops); i++) {
+ ExhaustiveTest(maxatoms, maxops,
+ Split("", alphabet),
+ RegexpGenerator::EgrepOps(),
+ maxstrlen,
+ Split("", stralphabet),
+ wrapper,
+ tops[i]);
+ }
+}
+
+} // namespace re2
diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h
new file mode 100644
index 0000000..38a139f
--- /dev/null
+++ b/re2/testing/exhaustive_tester.h
@@ -0,0 +1,85 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
+#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
+
+#include <string>
+#include <vector>
+#include "util/util.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Exhaustive regular expression test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that NFA, DFA, and PCRE agree about whether each regexp matches
+// each possible string, and if so, where the match is.
+//
+// Can also be used in a "random" mode that generates a given number
+// of random regexp and strings, allowing testing of larger expressions
+// and inputs.
+class ExhaustiveTester : public RegexpGenerator {
+ public:
+ ExhaustiveTester(int maxatoms,
+ int maxops,
+ const vector<string>& alphabet,
+ const vector<string>& ops,
+ int maxstrlen,
+ const vector<string>& stralphabet,
+ const string& wrapper,
+ const string& topwrapper)
+ : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+ strgen_(maxstrlen, stralphabet),
+ wrapper_(wrapper),
+ topwrapper_(topwrapper),
+ regexps_(0), tests_(0), failures_(0),
+ randomstrings_(0), stringseed_(0), stringcount_(0) { }
+
+ int regexps() { return regexps_; }
+ int tests() { return tests_; }
+ int failures() { return failures_; }
+
+ // Needed for RegexpGenerator interface.
+ void HandleRegexp(const string& regexp);
+
+ // Causes testing to generate random input strings.
+ void RandomStrings(int32 seed, int32 count) {
+ randomstrings_ = true;
+ stringseed_ = seed;
+ stringcount_ = count;
+ }
+
+ private:
+ StringGenerator strgen_;
+ string wrapper_; // Regexp wrapper - either empty or has one %s.
+ string topwrapper_; // Regexp top-level wrapper.
+ int regexps_; // Number of HandleRegexp calls
+ int tests_; // Number of regexp tests.
+ int failures_; // Number of tests failed.
+
+ bool randomstrings_; // Whether to use random strings
+ int32 stringseed_; // If so, the seed.
+ int stringcount_; // If so, how many to generate.
+ DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
+};
+
+// Runs an exhaustive test on the given parameters.
+void ExhaustiveTest(int maxatoms, int maxops,
+ const vector<string>& alphabet,
+ const vector<string>& ops,
+ int maxstrlen, const vector<string>& stralphabet,
+ const string& wrapper,
+ const string& topwrapper);
+
+// Runs an exhaustive test using the given parameters and
+// the basic egrep operators.
+void EgrepTest(int maxatoms, int maxops, const string& alphabet,
+ int maxstrlen, const string& stralphabet,
+ const string& wrapper);
+
+} // namespace re2
+
+#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
new file mode 100644
index 0000000..e17bd14
--- /dev/null
+++ b/re2/testing/filtered_re2_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+
+DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
+
+namespace re2 {
+
+struct FilterTestVars {
+ vector<string> atoms;
+ vector<int> atom_indices;
+ vector<int> matches;
+ RE2::Options opts;
+ FilteredRE2 f;
+};
+
+TEST(FilteredRE2Test, EmptyTest) {
+ FilterTestVars v;
+ v.f.AllMatches("foo", v.atom_indices, &v.matches);
+ EXPECT_EQ(0, v.matches.size());
+}
+
+TEST(FilteredRE2Test, SmallOrTest) {
+ FLAGS_filtered_re2_min_atom_len = 4;
+
+ FilterTestVars v;
+ int id;
+ v.f.Add("(foo|bar)", v.opts, &id);
+
+ v.f.Compile(&v.atoms);
+ EXPECT_EQ(0, v.atoms.size());
+
+ v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
+ EXPECT_EQ(1, v.matches.size());
+ EXPECT_EQ(id, v.matches[0]);
+}
+
+struct AtomTest {
+ const char* testname;
+ // If any test needs more than this many regexps or atoms, increase
+ // the size of the corresponding array.
+ const char* regexps[20];
+ const char* atoms[20];
+};
+
+AtomTest atom_tests[] = {
+ {
+ // This test checks to make sure empty patterns are allowed.
+ "CheckEmptyPattern",
+ {""},
+ {}
+ }, {
+ // This test checks that all atoms of length greater than min length
+ // are found, and no atoms that are of smaller length are found.
+ "AllAtomsGtMinLengthFound", {
+ "(abc123|def456|ghi789).*mnop[x-z]+",
+ "abc..yyy..zz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc123",
+ "def456",
+ "ghi789",
+ "mnop",
+ "abc",
+ "yyy",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test to make sure that any atoms that have another atom as a
+ // substring in an OR are removed; that is, only the shortest
+ // substring is kept.
+ "SubstrAtomRemovesSuperStrInOr", {
+ "(abc123|abc|ghi789|abc1234).*[x-z]+",
+ "abcd..yyy..yyyzzz",
+ "mnmnpp[a-z]+PPP"
+ }, {
+ "abc",
+ "ghi789",
+ "abcd",
+ "yyy",
+ "yyyzzz",
+ "mnmnpp",
+ "ppp"
+ }
+ }, {
+ // Test character class expansion.
+ "CharClassExpansion", {
+ "m[a-c][d-f]n.*[x-z]+",
+ "[x-y]bcde[ab]"
+ }, {
+ "madn", "maen", "mafn",
+ "mbdn", "mben", "mbfn",
+ "mcdn", "mcen", "mcfn",
+ "xbcdea", "xbcdeb",
+ "ybcdea", "ybcdeb"
+ }
+ }
+};
+
+void AddRegexpsAndCompile(const char* regexps[],
+ int n,
+ struct FilterTestVars* v) {
+ v->opts.set_utf8(false);
+ for (int i = 0; i < n; i++) {
+ int id;
+ v->f.Add(regexps[i], v->opts, &id);
+ }
+ v->f.Compile(&v->atoms);
+}
+
+bool CheckExpectedAtoms(const char* atoms[],
+ int n,
+ const char* testname,
+ struct FilterTestVars* v) {
+ vector<string> expected;
+ for (int i = 0; i < n; i++)
+ expected.push_back(atoms[i]);
+
+ bool pass = expected.size() == v->atoms.size();
+
+ sort(v->atoms.begin(), v->atoms.end());
+ sort(expected.begin(), expected.end());
+ for (int i = 0; pass && i < n; i++)
+ pass = pass && expected[i] == v->atoms[i];
+
+ if (!pass) {
+ LOG(WARNING) << "Failed " << testname;
+ LOG(WARNING) << "Expected #atoms = " << expected.size();
+ for (int i = 0; i < expected.size(); i++)
+ LOG(WARNING) << expected[i];
+ LOG(WARNING) << "Found #atoms = " << v->atoms.size();
+ for (int i = 0; i < v->atoms.size(); i++)
+ LOG(WARNING) << v->atoms[i];
+ }
+
+ return pass;
+}
+
+TEST(FilteredRE2Test, AtomTests) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ int nfail = 0;
+ for (int i = 0; i < arraysize(atom_tests); i++) {
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[i];
+ int natom, nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ for (natom = 0; natom < arraysize(t->atoms); natom++)
+ if (t->atoms[natom] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+ if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
+ nfail++;
+ }
+ EXPECT_EQ(0, nfail);
+}
+
+void FindAtomIndices(const vector<string> atoms,
+ const vector<string> matched_atoms,
+ vector<int>* atom_indices) {
+ atom_indices->clear();
+ for (int i = 0; i < matched_atoms.size(); i++) {
+ int j = 0;
+ for (; j < atoms.size(); j++) {
+ if (matched_atoms[i] == atoms[j]) {
+ atom_indices->push_back(j);
+ break;
+ }
+ EXPECT_LT(j, atoms.size());
+ }
+ }
+}
+
+TEST(FilteredRE2Test, MatchEmptyPattern) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[0];
+ // We are using the regexps used in one of the atom tests
+ // for this test. Adding the EXPECT here to make sure
+ // the index we use for the test is for the correct test.
+ EXPECT_EQ("CheckEmptyPattern", t->testname);
+ int nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+ string text = "0123";
+ vector<int> atom_ids;
+ vector<int> matching_regexps;
+ EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
+}
+
+TEST(FilteredRE2Test, MatchTests) {
+ FLAGS_filtered_re2_min_atom_len = 3;
+
+ FilterTestVars v;
+ AtomTest* t = &atom_tests[2];
+ // We are using the regexps used in one of the atom tests
+ // for this test.
+ EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", t->testname);
+ int nregexp;
+ for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ if (t->regexps[nregexp] == NULL)
+ break;
+ AddRegexpsAndCompile(t->regexps, nregexp, &v);
+
+ string text = "abc121212xyz";
+ // atoms = abc
+ vector<int> atom_ids;
+ vector<string> atoms;
+ atoms.push_back("abc");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ vector<int> matching_regexps;
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(1, matching_regexps.size());
+
+ text = "abc12312yyyzzz";
+ atoms.clear();
+ atoms.push_back("abc");
+ atoms.push_back("yyy");
+ atoms.push_back("yyyzzz");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(1, matching_regexps.size());
+
+ text = "abcd12yyy32yyyzzz";
+ atoms.clear();
+ atoms.push_back("abc");
+ atoms.push_back("abcd");
+ atoms.push_back("yyy");
+ atoms.push_back("yyyzzz");
+ FindAtomIndices(v.atoms, atoms, &atom_ids);
+ LOG(INFO) << "S: " << atom_ids.size();
+ for (int i = 0; i < atom_ids.size(); i++)
+ LOG(INFO) << "i: " << i << " : " << atom_ids[i];
+ v.f.AllMatches(text, atom_ids, &matching_regexps);
+ EXPECT_EQ(2, matching_regexps.size());
+}
+
+} // namespace re2
diff --git a/re2/testing/mimics_pcre_test.cc b/re2/testing/mimics_pcre_test.cc
new file mode 100644
index 0000000..f965092
--- /dev/null
+++ b/re2/testing/mimics_pcre_test.cc
@@ -0,0 +1,76 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct PCRETest {
+ const char* regexp;
+ bool should_match;
+};
+
+static PCRETest tests[] = {
+ // Most things should behave exactly.
+ { "abc", true },
+ { "(a|b)c", true },
+ { "(a*|b)c", true },
+ { "(a|b*)c", true },
+ { "a(b|c)d", true },
+ { "a(()|())c", true },
+ { "ab*c", true },
+ { "ab+c", true },
+ { "a(b*|c*)d", true },
+ { "\\W", true },
+ { "\\W{1,2}", true },
+ { "\\d", true },
+
+ // Check that repeated empty strings do not.
+ { "(a*)*", false },
+ { "x(a*)*y", false },
+ { "(a*)+", false },
+ { "(a+)*", true },
+ { "(a+)+", true },
+ { "(a+)+", true },
+
+ // \v is the only character class that shouldn't.
+ { "\\b", true },
+ { "\\v", false },
+ { "\\d", true },
+
+ // The handling of ^ in multi-line mode is different, as is
+ // the handling of $ in single-line mode. (Both involve
+ // boundary cases if the string ends with \n.)
+ { "\\A", true },
+ { "\\z", true },
+ { "(?m)^", false },
+ { "(?m)$", true },
+ { "(?-m)^", true },
+ { "(?-m)$", false }, // In PCRE, == \Z
+ { "(?m)\\A", true },
+ { "(?m)\\z", true },
+ { "(?-m)\\A", true },
+ { "(?-m)\\z", true },
+};
+
+TEST(MimicsPCRE, SimpleTests) {
+ for (int i = 0; i < arraysize(tests); i++) {
+ const PCRETest& t = tests[i];
+ for (int j = 0; j < 2; j++) {
+ Regexp::ParseFlags flags = Regexp::LikePerl;
+ if (j == 0)
+ flags = flags | Regexp::Latin1;
+ Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+ CHECK(re) << " " << t.regexp;
+ CHECK_EQ(t.should_match, re->MimicsPCRE())
+ << " " << t.regexp << " "
+ << (j==0 ? "latin1" : "utf");
+ re->Decref();
+ }
+ }
+}
+
+} // namespace re2
diff --git a/re2/testing/null_walker.cc b/re2/testing/null_walker.cc
new file mode 100644
index 0000000..09b53cb
--- /dev/null
+++ b/re2/testing/null_walker.cc
@@ -0,0 +1,44 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Null walker. For benchmarking the walker itself.
+
+class NullWalker : public Regexp::Walker<bool> {
+ public:
+ NullWalker() { }
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args);
+
+ bool ShortVisit(Regexp* re, bool a) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "NullWalker::ShortVisit called";
+ return a;
+ }
+
+ private:
+ DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
+};
+
+// Called after visiting re's children. child_args contains the return
+// value from each of the children's PostVisits (i.e., whether each child
+// can match an empty string). Returns whether this clause can match an
+// empty string.
+bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ return false;
+}
+
+// Returns whether re can match an empty string.
+void Regexp::NullWalk() {
+ NullWalker w;
+ w.Walk(this, false);
+}
+
+} // namespace re2
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
new file mode 100644
index 0000000..a78395a
--- /dev/null
+++ b/re2/testing/parse_test.cc
@@ -0,0 +1,318 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct Test {
+ const char* regexp;
+ const char* parse;
+};
+
+static Test tests[] = {
+ // Base cases
+ { "a", "lit{a}" },
+ { "a.", "cat{lit{a}dot{}}" },
+ { "a.b", "cat{lit{a}dot{}lit{b}}" },
+ { "ab", "str{ab}" },
+ { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
+ { "abc", "str{abc}" },
+ { "a|^", "alt{lit{a}bol{}}" },
+ { "a|b", "cc{0x61-0x62}" },
+ { "(a)", "cap{lit{a}}" },
+ { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
+ { "a*", "star{lit{a}}" },
+ { "a+", "plus{lit{a}}" },
+ { "a?", "que{lit{a}}" },
+ { "a{2}", "rep{2,2 lit{a}}" },
+ { "a{2,3}", "rep{2,3 lit{a}}" },
+ { "a{2,}", "rep{2,-1 lit{a}}" },
+ { "a*?", "nstar{lit{a}}" },
+ { "a+?", "nplus{lit{a}}" },
+ { "a??", "nque{lit{a}}" },
+ { "a{2}?", "nrep{2,2 lit{a}}" },
+ { "a{2,3}?", "nrep{2,3 lit{a}}" },
+ { "a{2,}?", "nrep{2,-1 lit{a}}" },
+ { "", "emp{}" },
+ { "|", "alt{emp{}emp{}}" },
+ { ".", "dot{}" },
+ { "^", "bol{}" },
+ { "$", "eol{}" },
+ { "\\|", "lit{|}" },
+ { "\\(", "lit{(}" },
+ { "\\)", "lit{)}" },
+ { "\\*", "lit{*}" },
+ { "\\+", "lit{+}" },
+ { "\\?", "lit{?}" },
+ { "{", "lit{{}" },
+ { "}", "lit{}}" },
+ { "\\.", "lit{.}" },
+ { "\\^", "lit{^}" },
+ { "\\$", "lit{$}" },
+ { "\\\\", "lit{\\}" },
+ { "[ace]", "cc{0x61 0x63 0x65}" },
+ { "[abc]", "cc{0x61-0x63}" },
+ { "[a-z]", "cc{0x61-0x7a}" },
+ { "[a]", "lit{a}" },
+ { "\\-", "lit{-}" },
+ { "-", "lit{-}" },
+ { "\\_", "lit{_}" },
+
+ // Posix and Perl extensions
+ { "[[:lower:]]", "cc{0x61-0x7a}" },
+ { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+ { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
+ { "\\d", "cc{0x30-0x39}" },
+ { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
+ { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
+ { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
+ { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
+ { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
+ { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
+ { "\\C", "byte{}" },
+
+ // More interesting regular expressions.
+ { "a{,2}", "str{a{,2}}" },
+ { "\\.\\^\\$\\\\", "str{.^$\\}" },
+ { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
+ { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+ { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
+ { "a*{", "cat{star{lit{a}}lit{{}}" },
+
+ // Test precedences
+ { "(?:ab)*", "star{str{ab}}" },
+ { "(ab)*", "star{cap{str{ab}}}" },
+ { "ab|cd", "alt{str{ab}str{cd}}" },
+ { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
+
+ // Test flattening.
+ { "(?:a)", "lit{a}" },
+ { "(?:ab)(?:cd)", "str{abcd}" },
+ { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+ { "a|.", "dot{}" },
+ { ".|a", "dot{}" },
+
+ // Test Perl quoted literals
+ { "\\Q+|*?{[\\E", "str{+|*?{[}" },
+ { "\\Q+\\E+", "plus{lit{+}}" },
+ { "\\Q\\\\E", "lit{\\}" },
+ { "\\Q\\\\\\E", "str{\\\\}" },
+
+ // Test Perl \A and \z
+ { "(?m)^", "bol{}" },
+ { "(?m)$", "eol{}" },
+ { "(?-m)^", "bot{}" },
+ { "(?-m)$", "eot{}" },
+ { "(?m)\\A", "bot{}" },
+ { "(?m)\\z", "eot{}" },
+ { "(?-m)\\A", "bot{}" },
+ { "(?-m)\\z", "eot{}" },
+
+ // Test named captures
+ { "(?P<name>a)", "cap{name:lit{a}}" },
+
+ // Case-folded literals
+ { "[Aa]", "litfold{a}" },
+
+ // Strings
+ { "abcde", "str{abcde}" },
+ { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
+};
+
+static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | Regexp::PerlX | Regexp::PerlClasses;
+
+void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, const string& title) {
+ for (int i = 0; i < ntests; i++) {
+ RegexpStatus status;
+ Regexp* re = Regexp::Parse(tests[i].regexp, flags, &status);
+ CHECK(re != NULL) << " " << tests[i].regexp << " "
+ << status.Text();
+ string s = re->Dump();
+ EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp;
+ re->Decref();
+ }
+}
+
+// Test that regexps parse to expected structures.
+TEST(TestParse, SimpleRegexps) {
+ TestParse(tests, arraysize(tests), kTestFlags, "simple");
+}
+
+Test foldcase_tests[] = {
+ { "AbCdE", "strfold{abcde}" },
+ { "[Aa]", "litfold{a}" },
+ { "a", "litfold{a}" },
+
+ // 0x17F is an old English long s (looks like an f) and folds to s.
+ // 0x212A is the Kelvin symbol and folds to k.
+ { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
+ { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+};
+
+// Test that parsing with FoldCase works.
+TEST(TestParse, FoldCase) {
+ TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
+}
+
+Test literal_tests[] = {
+ { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
+};
+
+// Test that parsing with Literal works.
+TEST(TestParse, Literal) {
+ TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
+}
+
+Test matchnl_tests[] = {
+ { ".", "dot{}" },
+ { "\n", "lit{\n}" },
+ { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
+ { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+// Test that parsing with MatchNL works.
+// (Also tested above during simple cases.)
+TEST(TestParse, MatchNL) {
+ TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
+}
+
+Test nomatchnl_tests[] = {
+ { ".", "cc{0-0x9 0xb-0x10ffff}" },
+ { "\n", "lit{\n}" },
+ { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
+ { "[a\\n]", "cc{0xa 0x61}" },
+};
+
+
+// Test that parsing without MatchNL works.
+TEST(TestParse, NoMatchNL) {
+ TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
+}
+
+// Invalid regular expressions
+const char* badtests[] = {
+ "(",
+ ")",
+ "(a",
+ "(a|b|",
+ "(a|b",
+ "[a-z",
+ "([a-z)",
+ "x{1001}",
+ "\xff", // Invalid UTF-8
+ "[\xff]",
+ "[\\\xff]",
+ "\\\xff",
+ "(?P<name>a",
+ "(?P<name>",
+ "(?P<name",
+ "(?P<x y>a)",
+ "(?P<>a)",
+ "[a-Z]",
+ "(?i)[a-Z]",
+};
+
+// Valid in Perl, bad in POSIX
+const char* only_perl[] = {
+ "[a-b-c]",
+ "\\Qabc\\E",
+ "\\Q*+?{[\\E",
+ "\\Q\\\\E",
+ "\\Q\\\\\\E",
+ "\\Q\\\\\\\\E",
+ "\\Q\\\\\\\\\\E",
+ "(?:a)",
+ "(?P<name>a)",
+};
+
+// Valid in POSIX, bad in Perl.
+const char* only_posix[] = {
+ "a++",
+ "a**",
+ "a?*",
+ "a+*",
+ "a{1}*",
+};
+
+// Test that parser rejects bad regexps.
+TEST(TestParse, InvalidRegexps) {
+ for (int i = 0; i < arraysize(badtests); i++) {
+ CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
+ << " " << badtests[i];
+ CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
+ << " " << badtests[i];
+ }
+ for (int i = 0; i < arraysize(only_posix); i++) {
+ CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
+ << " " << only_posix[i];
+ Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
+ CHECK(re) << " " << only_posix[i];
+ re->Decref();
+ }
+ for (int i = 0; i < arraysize(only_perl); i++) {
+ CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
+ << " " << only_perl[i];
+ Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
+ CHECK(re) << " " << only_perl[i];
+ re->Decref();
+ }
+}
+
+// Test that ToString produces original regexp or equivalent one.
+TEST(TestToString, EquivalentParse) {
+ for (int i = 0; i < arraysize(tests); i++) {
+ RegexpStatus status;
+ Regexp* re = Regexp::Parse(tests[i].regexp, kTestFlags, &status);
+ CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
+ string s = re->Dump();
+ EXPECT_EQ(string(tests[i].parse), s);
+ string t = re->ToString();
+ if (t != tests[i].regexp) {
+ // If ToString didn't return the original regexp,
+ // it must have found one with fewer parens.
+ // Unfortunately we can't check the length here, because
+ // ToString produces "\\{" for a literal brace,
+ // but "{" is a shorter equivalent.
+ // CHECK_LT(t.size(), strlen(tests[i].regexp))
+ // << " t=" << t << " regexp=" << tests[i].regexp;
+
+ // Test that if we parse the new regexp we get the same structure.
+ Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+ CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
+ string ss = nre->Dump();
+ string tt = nre->ToString();
+ if (s != ss || t != tt)
+ LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
+ EXPECT_EQ(s, ss);
+ EXPECT_EQ(t, tt);
+ nre->Decref();
+ }
+ re->Decref();
+ }
+}
+
+// Test that capture error args are correct.
+TEST(NamedCaptures, ErrorArgs) {
+ RegexpStatus status;
+ Regexp* re;
+
+ re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?P<name");
+
+ re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?P<space bar>");
+}
+
+} // namespace re2
diff --git a/re2/testing/possible_match_test.cc b/re2/testing/possible_match_test.cc
new file mode 100644
index 0000000..a71aa67
--- /dev/null
+++ b/re2/testing/possible_match_test.cc
@@ -0,0 +1,245 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <vector>
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Test that C++ strings are compared as uint8s, not int8s.
+// PossibleMatchRange doesn't depend on this, but callers probably will.
+TEST(CplusplusStrings, EightBit) {
+ string s = "\x70";
+ string t = "\xA0";
+ EXPECT_LT(s, t);
+}
+
+struct PrefixTest {
+ const char* regexp;
+ int maxlen;
+ const char* min;
+ const char* max;
+};
+
+static PrefixTest tests[] = {
+ { "", 10, "", "", },
+ { "Abcdef", 10, "Abcdef", "Abcdef" },
+ { "abc(def|ghi)", 10, "abcdef", "abcghi" },
+ { "a+hello", 10, "aa", "ahello" },
+ { "a*hello", 10, "a", "hello" },
+ { "def|abc", 10, "abc", "def" },
+ { "a(b)(c)[d]", 10, "abcd", "abcd" },
+ { "ab(cab|cat)", 10, "abcab", "abcat" },
+ { "ab(cab|ca)x", 10, "abcabx", "abcax" },
+ { "(ab|x)(c|de)", 10, "abc", "xde" },
+ { "(ab|x)?(c|z)?", 10, "", "z" },
+ { "[^\\s\\S]", 10, "", "" },
+ { "(abc)+", 5, "abc", "abcac" },
+ { "(abc)+", 2, "ab", "ac" },
+ { "(abc)+", 1, "a", "b" },
+ { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
+ { "a*", 10, "", "ab" },
+
+ { "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
+ { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
+ { "(?i)a+hello", 10, "AA", "ahello" },
+ { "(?i)a*hello", 10, "A", "hello" },
+ { "(?i)def|abc", 10, "ABC", "def" },
+ { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
+ { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
+ { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
+ { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
+ { "(?i)(ab|x)?(c|z)?", 10, "", "z" },
+ { "(?i)[^\\s\\S]", 10, "", "" },
+ { "(?i)(abc)+", 5, "ABC", "abcac" },
+ { "(?i)(abc)+", 2, "AB", "ac" },
+ { "(?i)(abc)+", 1, "A", "b" },
+ { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
+ { "(?i)a*", 10, "", "ab" },
+ { "(?i)A*", 10, "", "ab" },
+
+ { "\\AAbcdef", 10, "Abcdef", "Abcdef" },
+ { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
+ { "\\Aa+hello", 10, "aa", "ahello" },
+ { "\\Aa*hello", 10, "a", "hello" },
+ { "\\Adef|abc", 10, "abc", "def" },
+ { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
+ { "\\Aab(cab|cat)", 10, "abcab", "abcat" },
+ { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
+ { "\\A(ab|x)(c|de)", 10, "abc", "xde" },
+ { "\\A(ab|x)?(c|z)?", 10, "", "z" },
+ { "\\A[^\\s\\S]", 10, "", "" },
+ { "\\A(abc)+", 5, "abc", "abcac" },
+ { "\\A(abc)+", 2, "ab", "ac" },
+ { "\\A(abc)+", 1, "a", "b" },
+ { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
+ { "\\Aa*", 10, "", "ab" },
+
+ { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
+ { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
+ { "(?i)\\Aa+hello", 10, "AA", "ahello" },
+ { "(?i)\\Aa*hello", 10, "A", "hello" },
+ { "(?i)\\Adef|abc", 10, "ABC", "def" },
+ { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
+ { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
+ { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
+ { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
+ { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
+ { "(?i)\\A[^\\s\\S]", 10, "", "" },
+ { "(?i)\\A(abc)+", 5, "ABC", "abcac" },
+ { "(?i)\\A(abc)+", 2, "AB", "ac" },
+ { "(?i)\\A(abc)+", 1, "A", "b" },
+ { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
+ { "(?i)\\Aa*", 10, "", "ab" },
+ { "(?i)\\AA*", 10, "", "ab" },
+};
+
+TEST(PossibleMatchRange, HandWritten) {
+ for (int i = 0; i < arraysize(tests); i++) {
+ for (int j = 0; j < 2; j++) {
+ const PrefixTest& t = tests[i];
+ string min, max;
+ if (j == 0) {
+ LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
+ << " " << t.regexp;
+ delete prog;
+ re->Decref();
+ } else {
+ CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
+ }
+ EXPECT_EQ(t.min, min) << t.regexp;
+ EXPECT_EQ(t.max, max) << t.regexp;
+ }
+ }
+}
+
+// Test cases where PossibleMatchRange should return false.
+TEST(PossibleMatchRange, Failures) {
+ string min, max;
+
+ // Fails because no room to write max.
+ EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
+
+ // Fails because there is no max -- any non-empty string matches
+ // or begins a match. Have to use Latin-1 input, because there
+ // are no valid UTF-8 strings beginning with byte 0xFF.
+ EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ EXPECT_FALSE(RE2(".*", RE2::Latin1).
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ EXPECT_FALSE(RE2("\\C*").
+ PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+
+ // Fails because it's a malformed regexp.
+ EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
+ << "min=" << CEscape(min) << ", max=" << CEscape(max);
+}
+
+// Exhaustive test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that the prefix information agrees with whether
+// the regexp matches each of the strings.
+class PossibleMatchTester : public RegexpGenerator {
+ public:
+ PossibleMatchTester(int maxatoms,
+ int maxops,
+ const vector<string>& alphabet,
+ const vector<string>& ops,
+ int maxstrlen,
+ const vector<string>& stralphabet)
+ : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+ strgen_(maxstrlen, stralphabet),
+ regexps_(0), tests_(0) { }
+
+ int regexps() { return regexps_; }
+ int tests() { return tests_; }
+
+ // Needed for RegexpGenerator interface.
+ void HandleRegexp(const string& regexp);
+
+ private:
+ StringGenerator strgen_;
+
+ int regexps_; // Number of HandleRegexp calls
+ int tests_; // Number of regexp tests.
+
+ DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
+};
+
+// Processes a single generated regexp.
+// Checks that all accepted strings agree with the prefix range.
+void PossibleMatchTester::HandleRegexp(const string& regexp) {
+ regexps_++;
+
+ VLOG(3) << CEscape(regexp);
+
+ RE2 re(regexp, RE2::Latin1);
+ CHECK_EQ(re.error(), "");
+
+ string min, max;
+ if(!re.PossibleMatchRange(&min, &max, 10)) {
+ // There's no good max for "\\C*". Can't use strcmp
+ // because sometimes it gets embedded in more
+ // complicated expressions.
+ if(strstr(regexp.c_str(), "\\C*"))
+ return;
+ LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
+ }
+
+ strgen_.Reset();
+ while (strgen_.HasNext()) {
+ const StringPiece& s = strgen_.Next();
+ tests_++;
+ if (!RE2::FullMatch(s, re))
+ continue;
+ CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
+ CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
+ }
+}
+
+// Try all regular expressions made from up to
+// 3 instances of a b [0-9] and up to 4 egrep operators,
+// and then try all strings of length up to 6 using
+// the characters {a,b,4}.
+// Try less in debug mode, because it's so slow.
+TEST(PossibleMatchRange, Exhaustive) {
+ int natom = 3;
+ int noperator = 4;
+ int stringlen = 6;
+ if (DEBUG_MODE) {
+ natom = 2;
+ noperator = 3;
+ stringlen = 4;
+ }
+ PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
+ RegexpGenerator::EgrepOps(),
+ stringlen, Explode("ab4"));
+ t.Generate();
+ LOG(INFO) << t.regexps() << " regexps, "
+ << t.tests() << " tests";
+}
+
+} // namespace re2
diff --git a/re2/testing/random_test.cc b/re2/testing/random_test.cc
new file mode 100644
index 0000000..6d6b5eb
--- /dev/null
+++ b/re2/testing/random_test.cc
@@ -0,0 +1,87 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Random testing of regular expression matching.
+
+#include <stdio.h>
+#include "util/test.h"
+#include "re2/testing/exhaustive_tester.h"
+
+DEFINE_int32(regexpseed, 404, "Random regexp seed.");
+DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
+DEFINE_int32(stringseed, 200, "Random string seed.");
+DEFINE_int32(stringcount, 100, "How many random strings to generate.");
+
+namespace re2 {
+
+// Runs a random test on the given parameters.
+// (Always uses the same random seeds for reproducibility.
+// Can give different seeds on command line.)
+static void RandomTest(int maxatoms, int maxops,
+ const vector<string>& alphabet,
+ const vector<string>& ops,
+ int maxstrlen, const vector<string>& stralphabet,
+ const string& wrapper) {
+ ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
+ maxstrlen, stralphabet, wrapper, "");
+ t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
+ t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
+ printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
+ t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
+ EXPECT_EQ(0, t.failures());
+}
+
+// Tests random small regexps involving literals and egrep operators.
+TEST(Random, SmallEgrepLiterals) {
+ RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(),
+ 20, Explode("abc"),
+ "");
+}
+
+// Tests random bigger regexps involving literals and egrep operators.
+TEST(Random, BigEgrepLiterals) {
+ RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(),
+ 20, Explode("abc"),
+ "");
+}
+
+// Tests random small regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, SmallEgrepCaptures) {
+ RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
+ 20, Explode("abc"),
+ "");
+}
+
+// Tests random bigger regexps involving literals, capturing parens,
+// and egrep operators.
+TEST(Random, BigEgrepCaptures) {
+ RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
+ 20, Explode("abc"),
+ "");
+}
+
+// Tests random large complicated expressions, using all the possible
+// operators, some literals, some parenthesized literals, and predefined
+// character classes like \d. (Adding larger character classes would
+// make for too many possibilities.)
+TEST(Random, Complicated) {
+ vector<string> ops = Split(" ",
+ "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
+ "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
+ "%s{2} %s{2,} %s{3,4} %s{4,5}");
+
+ // Use (?:\b) and (?:\B) instead of \b and \B,
+ // because PCRE rejects \b* but accepts (?:\b)*.
+ // Ditto ^ and $.
+ vector<string> atoms = Split(" ",
+ ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
+ "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
+ "a (a) b c - \\\\");
+ vector<string> alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");
+ RandomTest(10, 10, atoms, ops, 20, alphabet, "");
+}
+
+} // namespace re2
+
diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc
new file mode 100644
index 0000000..f16b056
--- /dev/null
+++ b/re2/testing/re2_arg_test.cc
@@ -0,0 +1,140 @@
+// Copyright 2005 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This tests to make sure numbers are parsed from strings
+// correctly.
+// Todo: Expand the test to validate strings parsed to the other types
+// supported by RE2::Arg class
+
+#include "util/test.h"
+#include "re2/re2.h"
+
+namespace re2 {
+
+struct SuccessTable {
+ const char * value_string;
+ union {
+ int64 value_int64;
+ uint64 value_uint64;
+ int value_int;
+ uint32 value_uint32;
+ int16 value_int16;
+ uint16 value_uint16;
+ };
+ bool success[6];
+};
+
+// Test boundary cases for different integral sizes.
+// Specifically I want to make sure that values outside the boundries
+// of an integral type will fail and that negative numbers will fail
+// for unsigned types. The following table contains the boundaries for
+// the various integral types and has entries for whether or not each
+// type can contain the given value.
+const SuccessTable kSuccessTable[] = {
+// string integer value short ushort int uint int64 uint64
+// 0 to 2^7-1
+{ "0", {0}, { true, true, true, true, true, true }},
+{ "127", {127}, { true, true, true, true, true, true }},
+
+// -1 to -2^7
+{ "-1", {-1}, { true, false, true, false, true, false }},
+{ "-128", {-128}, { true, false, true, false, true, false }},
+
+// 2^7 to 2^8-1
+{ "128", {128}, { true, true, true, true, true, true, }},
+{ "255", {255}, { true, true, true, true, true, true, }},
+
+// 2^8 to 2^15-1
+{ "256", {256}, { true , true, true, true, true, true }},
+{ "32767", {32767}, { true , true, true, true, true, true }},
+
+// -2^7-1 to -2^15
+{ "-129", {-129}, { true, false, true, false, true, false }},
+{ "-32768", {-32768}, { true, false, true, false, true, false }},
+
+// 2^15 to 2^16-1
+{ "32768", {32768}, { false, true, true, true, true, true }},
+{ "65535", {65535}, { false, true, true, true, true, true }},
+
+// 2^16 to 2^31-1
+{ "65536", {65536}, { false, false, true, true, true, true }},
+{ "2147483647", {2147483647}, { false, false, true, true, true, true }},
+
+// -2^15-1 to -2^31
+{ "-32769", {-32769}, { false, false, true, false, true, false }},
+{ "-2147483648",
+ {0xFFFFFFFF80000000LL}, { false, false, true, false, true, false }},
+
+// 2^31 to 2^32-1
+{ "2147483648", {2147483648U}, { false, false, false, true, true, true }},
+{ "4294967295", {4294967295U}, { false, false, false, true, true, true }},
+
+// 2^32 to 2^63-1
+{ "4294967296", {4294967296LL}, { false, false, false, false, true, true }},
+{ "9223372036854775807",
+ {9223372036854775807LL}, { false, false, false, false, true, true }},
+
+// -2^31-1 to -2^63
+{ "-2147483649",{-2147483649LL},{ false, false, false, false, true, false }},
+{ "-9223372036854775808",
+ {0x8000000000000000LL}, { false, false, false, false, true, false }},
+
+// 2^63 to 2^64-1
+{ "9223372036854775808",
+ {9223372036854775808ULL}, { false, false, false, false, false, true }},
+{ "18446744073709551615",
+ {18446744073709551615ULL}, { false, false, false, false, false, true }},
+
+// >= 2^64
+{ "18446744073709551616",
+ {0}, { false, false, false, false, false, false }},
+};
+
+const int kNumStrings = ARRAYSIZE(kSuccessTable);
+
+// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M
+// macro outside of a TEST block and this seems to be the only way to
+// avoid code duplication. I can also pull off a couple nice tricks
+// using concatenation for the type I'm checking against.
+#define PARSE_FOR_TYPE(type, column) { \
+ type r; \
+ for ( int i = 0; i < kNumStrings; ++i ) { \
+ RE2::Arg arg(&r); \
+ const char* const p = kSuccessTable[i].value_string; \
+ bool retval = arg.Parse(p, strlen(p)); \
+ bool success = kSuccessTable[i].success[column]; \
+ ASSERT_TRUE_M(retval == success, \
+ StringPrintf("Parsing '%s' for type " #type " should return %d", \
+ p, success).c_str()); \
+ if ( success ) { \
+ ASSERT_EQUALS(r, kSuccessTable[i].value_##type); \
+ } \
+ } \
+}
+
+TEST(REArgTest, Int16Test) {
+ PARSE_FOR_TYPE(int16, 0);
+}
+
+TEST(REArgTest, Uint16Test) {
+ PARSE_FOR_TYPE(uint16, 1);
+}
+
+TEST(REArgTest, IntTest) {
+ PARSE_FOR_TYPE(int, 2);
+}
+
+TEST(REArgTest, UInt32Test) {
+ PARSE_FOR_TYPE(uint32, 3);
+}
+
+TEST(REArgTest, Iint64Test) {
+ PARSE_FOR_TYPE(int64, 4);
+}
+
+TEST(REArgTest, Uint64Test) {
+ PARSE_FOR_TYPE(uint64, 5);
+}
+
+} // namespace re2
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
new file mode 100644
index 0000000..256df1f
--- /dev/null
+++ b/re2/testing/re2_test.cc
@@ -0,0 +1,1176 @@
+// -*- coding: utf-8 -*-
+// Copyright 2002-2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO: Test extractions for PartialMatch/Consume
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <vector>
+#include "util/test.h"
+#include "re2/re2.h"
+
+DECLARE_bool(logtostderr);
+
+namespace re2 {
+
+TEST(RE2, HexTests) {
+
+ VLOG(1) << "hex tests";
+
+#define CHECK_HEX(type, value) \
+ do { \
+ type v; \
+ CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
+ CHECK_EQ(v, 0x ## value); \
+ CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+ CHECK_EQ(v, 0x ## value); \
+ } while(0)
+
+ CHECK_HEX(short, 2bad);
+ CHECK_HEX(unsigned short, 2badU);
+ CHECK_HEX(int, dead);
+ CHECK_HEX(unsigned int, deadU);
+ CHECK_HEX(long, 7eadbeefL);
+ CHECK_HEX(unsigned long, deadbeefUL);
+ CHECK_HEX(long long, 12345678deadbeefLL);
+ CHECK_HEX(unsigned long long, cafebabedeadbeefULL);
+
+#undef CHECK_HEX
+}
+
+TEST(RE2, OctalTests) {
+ VLOG(1) << "octal tests";
+
+#define CHECK_OCTAL(type, value) \
+ do { \
+ type v; \
+ CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
+ CHECK_EQ(v, 0 ## value); \
+ CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+ CHECK_EQ(v, 0 ## value); \
+ } while(0)
+
+ CHECK_OCTAL(short, 77777);
+ CHECK_OCTAL(unsigned short, 177777U);
+ CHECK_OCTAL(int, 17777777777);
+ CHECK_OCTAL(unsigned int, 37777777777U);
+ CHECK_OCTAL(long, 17777777777L);
+ CHECK_OCTAL(unsigned long, 37777777777UL);
+ CHECK_OCTAL(long long, 777777777777777777777LL);
+ CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);
+
+#undef CHECK_OCTAL
+}
+
+TEST(RE2, DecimalTests) {
+ VLOG(1) << "decimal tests";
+
+#define CHECK_DECIMAL(type, value) \
+ do { \
+ type v; \
+ CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
+ CHECK_EQ(v, value); \
+ CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+ CHECK_EQ(v, value); \
+ } while(0)
+
+ CHECK_DECIMAL(short, -1);
+ CHECK_DECIMAL(unsigned short, 9999);
+ CHECK_DECIMAL(int, -1000);
+ CHECK_DECIMAL(unsigned int, 12345U);
+ CHECK_DECIMAL(long, -10000000L);
+ CHECK_DECIMAL(unsigned long, 3083324652U);
+ CHECK_DECIMAL(long long, -100000000000000LL);
+ CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);
+
+#undef CHECK_DECIMAL
+}
+
+TEST(RE2, Replace) {
+ VLOG(1) << "TestReplace";
+
+ struct ReplaceTest {
+ const char *regexp;
+ const char *rewrite;
+ const char *original;
+ const char *single;
+ const char *global;
+ int greplace_count;
+ };
+ static const ReplaceTest tests[] = {
+ { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
+ "\\2\\1ay",
+ "the quick brown fox jumps over the lazy dogs.",
+ "ethay quick brown fox jumps over the lazy dogs.",
+ "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
+ 9 },
+ { "\\w+",
+ "\\0-NOSPAM",
+ "[email protected]",
+ "[email protected]",
+ "[email protected]",
+ 4 },
+ { "^",
+ "(START)",
+ "foo",
+ "(START)foo",
+ "(START)foo",
+ 1 },
+ { "^",
+ "(START)",
+ "",
+ "(START)",
+ "(START)",
+ 1 },
+ { "$",
+ "(END)",
+ "",
+ "(END)",
+ "(END)",
+ 1 },
+ { "b",
+ "bb",
+ "ababababab",
+ "abbabababab",
+ "abbabbabbabbabb",
+ 5 },
+ { "b",
+ "bb",
+ "bbbbbb",
+ "bbbbbbb",
+ "bbbbbbbbbbbb",
+ 6 },
+ { "b+",
+ "bb",
+ "bbbbbb",
+ "bb",
+ "bb",
+ 1 },
+ { "b*",
+ "bb",
+ "bbbbbb",
+ "bb",
+ "bb",
+ 1 },
+ { "b*",
+ "bb",
+ "aaaaa",
+ "bbaaaaa",
+ "bbabbabbabbabbabb",
+ 6 },
+ // Check newline handling
+ { "a.*a",
+ "(\\0)",
+ "aba\naba",
+ "(aba)\naba",
+ "(aba)\n(aba)",
+ 2 },
+ { "", NULL, NULL, NULL, NULL, 0 }
+ };
+
+ for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
+ VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
+ string one(t->original);
+ CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
+ CHECK_EQ(one, t->single);
+ string all(t->original);
+ CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+ << "Got: " << all;
+ CHECK_EQ(all, t->global);
+ }
+}
+
+static void TestCheckRewriteString(const char* regexp, const char* rewrite,
+ bool expect_ok) {
+ string error;
+ RE2 exp(regexp);
+ bool actual_ok = exp.CheckRewriteString(rewrite, &error);
+ EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
+}
+
+TEST(CheckRewriteString, all) {
+ TestCheckRewriteString("abc", "foo", true);
+ TestCheckRewriteString("abc", "foo\\", false);
+ TestCheckRewriteString("abc", "foo\\0bar", true);
+
+ TestCheckRewriteString("a(b)c", "foo", true);
+ TestCheckRewriteString("a(b)c", "foo\\0bar", true);
+ TestCheckRewriteString("a(b)c", "foo\\1bar", true);
+ TestCheckRewriteString("a(b)c", "foo\\2bar", false);
+ TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
+
+ TestCheckRewriteString("a(b)(c)", "foo\\12", true);
+ TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
+ TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
+}
+
+TEST(RE2, Extract) {
+ VLOG(1) << "TestExtract";
+
+ string s;
+
+ CHECK(RE2::Extract("[email protected]", "(.*)@([^.]*)", "\\2!\\1", &s));
+ CHECK_EQ(s, "kremvax!boris");
+
+ CHECK(RE2::Extract("foo", ".*", "'\\0'", &s));
+ CHECK_EQ(s, "'foo'");
+ // check that false match doesn't overwrite
+ CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s));
+ CHECK_EQ(s, "'foo'");
+}
+
+TEST(RE2, Consume) {
+ VLOG(1) << "TestConsume";
+
+ RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace
+ string word;
+
+ string s(" aaa b!@#$@#$cccc");
+ StringPiece input(s);
+
+ CHECK(RE2::Consume(&input, r, &word));
+ CHECK_EQ(word, "aaa") << " input: " << input;
+ CHECK(RE2::Consume(&input, r, &word));
+ CHECK_EQ(word, "b") << " input: " << input;
+ CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input;
+}
+
+TEST(RE2, FindAndConsume) {
+ VLOG(1) << "TestFindAndConsume";
+
+ RE2 r("(\\w+)"); // matches a word
+ string word;
+
+ string s(" aaa b!@#$@#$cccc");
+ StringPiece input(s);
+
+ CHECK(RE2::FindAndConsume(&input, r, &word));
+ CHECK_EQ(word, "aaa");
+ CHECK(RE2::FindAndConsume(&input, r, &word));
+ CHECK_EQ(word, "b");
+ CHECK(RE2::FindAndConsume(&input, r, &word));
+ CHECK_EQ(word, "cccc");
+ CHECK(! RE2::FindAndConsume(&input, r, &word));
+
+ // Check that FindAndConsume works without any submatches.
+ // Earlier version used uninitialized data for
+ // length to consume.
+ input = "aaa";
+ CHECK(RE2::FindAndConsume(&input, "aaa"));
+ CHECK_EQ(input, "");
+}
+
+TEST(RE2, MatchNumberPeculiarity) {
+ VLOG(1) << "TestMatchNumberPeculiarity";
+
+ RE2 r("(foo)|(bar)|(baz)");
+ string word1;
+ string word2;
+ string word3;
+
+ CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
+ CHECK_EQ(word1, "foo");
+ CHECK_EQ(word2, "");
+ CHECK_EQ(word3, "");
+ CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
+ CHECK_EQ(word1, "");
+ CHECK_EQ(word2, "bar");
+ CHECK_EQ(word3, "");
+ CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
+ CHECK_EQ(word1, "");
+ CHECK_EQ(word2, "");
+ CHECK_EQ(word3, "baz");
+ CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3));
+
+ string a;
+ CHECK(RE2::FullMatch("hello", "(foo)|hello", &a));
+ CHECK_EQ(a, "");
+}
+
+TEST(RE2, Match) {
+ RE2 re("((\\w+):([0-9]+))"); // extracts host and port
+ StringPiece group[4];
+
+ // No match.
+ CHECK(!re.Match("zyzzyva", 0, RE2::UNANCHORED,
+ group, arraysize(group)));
+
+ // Matches and extracts.
+ CHECK(re.Match("a chrisr:9000 here", 0, RE2::UNANCHORED,
+ group, arraysize(group)));
+ CHECK_EQ(group[0], "chrisr:9000");
+ CHECK_EQ(group[1], "chrisr:9000");
+ CHECK_EQ(group[2], "chrisr");
+ CHECK_EQ(group[3], "9000");
+
+ string all, host;
+ int port;
+ CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port));
+ CHECK_EQ(all, "chrisr:9000");
+ CHECK_EQ(host, "chrisr");
+ CHECK_EQ(port, 9000);
+}
+
+static void TestRecursion(int size, const char *pattern) {
+ // Fill up a string repeating the pattern given
+ string domain;
+ domain.resize(size);
+ int patlen = strlen(pattern);
+ for (int i = 0; i < size; ++i) {
+ domain[i] = pattern[i % patlen];
+ }
+ // Just make sure it doesn't crash due to too much recursion.
+ RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
+ RE2::FullMatch(domain, re);
+}
+
+// A meta-quoted string, interpreted as a pattern, should always match
+// the original unquoted string.
+static void TestQuoteMeta(string unquoted,
+ const RE2::Options& options = RE2::DefaultOptions) {
+ string quoted = RE2::QuoteMeta(unquoted);
+ RE2 re(quoted, options);
+ EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
+ "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
+}
+
+// A meta-quoted string, interpreted as a pattern, should always match
+// the original unquoted string.
+static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
+ const RE2::Options& options = RE2::DefaultOptions) {
+ string quoted = RE2::QuoteMeta(unquoted);
+ RE2 re(quoted, options);
+ EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
+ "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
+}
+
+// Tests that quoted meta characters match their original strings,
+// and that a few things that shouldn't match indeed do not.
+TEST(QuoteMeta, Simple) {
+ TestQuoteMeta("foo");
+ TestQuoteMeta("foo.bar");
+ TestQuoteMeta("foo\\.bar");
+ TestQuoteMeta("[1-9]");
+ TestQuoteMeta("1.5-2.0?");
+ TestQuoteMeta("\\d");
+ TestQuoteMeta("Who doesn't like ice cream?");
+ TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
+ TestQuoteMeta("((?!)xxx).*yyy");
+ TestQuoteMeta("([");
+}
+TEST(QuoteMeta, SimpleNegative) {
+ NegativeTestQuoteMeta("foo", "bar");
+ NegativeTestQuoteMeta("...", "bar");
+ NegativeTestQuoteMeta("\\.", ".");
+ NegativeTestQuoteMeta("\\.", "..");
+ NegativeTestQuoteMeta("(a)", "a");
+ NegativeTestQuoteMeta("(a|b)", "a");
+ NegativeTestQuoteMeta("(a|b)", "(a)");
+ NegativeTestQuoteMeta("(a|b)", "a|b");
+ NegativeTestQuoteMeta("[0-9]", "0");
+ NegativeTestQuoteMeta("[0-9]", "0-9");
+ NegativeTestQuoteMeta("[0-9]", "[9]");
+ NegativeTestQuoteMeta("((?!)xxx)", "xxx");
+}
+
+TEST(QuoteMeta, Latin1) {
+ TestQuoteMeta("3\xb2 = 9", RE2::Latin1);
+}
+
+TEST(QuoteMeta, UTF8) {
+ TestQuoteMeta("Plácido Domingo");
+ TestQuoteMeta("xyz"); // No fancy utf8.
+ TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol.
+ TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character.
+ TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime.
+ TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note.
+ TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should
+ // still work.
+ NegativeTestQuoteMeta("27\xc2\xb0",
+ "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol.
+}
+
+TEST(QuoteMeta, HasNull) {
+ string has_null;
+
+ // string with one null character
+ has_null += '\0';
+ TestQuoteMeta(has_null);
+ NegativeTestQuoteMeta(has_null, "");
+
+ // Don't want null-followed-by-'1' to be interpreted as '\01'.
+ has_null += '1';
+ TestQuoteMeta(has_null);
+ NegativeTestQuoteMeta(has_null, "\1");
+}
+
+TEST(ProgramSize, BigProgram) {
+ RE2 re_simple("simple regexp");
+ RE2 re_medium("medium.*regexp");
+ RE2 re_complex("hard.{1,128}regexp");
+
+ CHECK_GT(re_simple.ProgramSize(), 0);
+ CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
+ CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
+}
+
+// Issue 956519: handling empty character sets was
+// causing NULL dereference. This tests a few empty character sets.
+// (The way to get an empty character set is to negate a full one.)
+TEST(EmptyCharset, Fuzz) {
+ static const char *empties[] = {
+ "[^\\S\\s]",
+ "[^\\S[:space:]]",
+ "[^\\D\\d]",
+ "[^\\D[:digit:]]"
+ };
+ for (int i = 0; i < arraysize(empties); i++)
+ CHECK(!RE2(empties[i]).Match("abc", 0, RE2::UNANCHORED, NULL, 0));
+}
+
+// Test that named groups work correctly.
+TEST(Capture, NamedGroups) {
+ {
+ RE2 re("(hello world)");
+ CHECK_EQ(re.NumberOfCapturingGroups(), 1);
+ const map<string, int>& m = re.NamedCapturingGroups();
+ CHECK_EQ(m.size(), 0);
+ }
+
+ {
+ RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
+ CHECK_EQ(re.NumberOfCapturingGroups(), 6);
+ const map<string, int>& m = re.NamedCapturingGroups();
+ CHECK_EQ(m.size(), 4);
+ CHECK_EQ(m.find("A")->second, 1);
+ CHECK_EQ(m.find("B")->second, 2);
+ CHECK_EQ(m.find("C")->second, 3);
+ CHECK_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous
+ }
+}
+
+TEST(RE2, FullMatchWithNoArgs) {
+ /***** FullMatch with no args *****/
+ CHECK(RE2::FullMatch("h", "h"));
+ CHECK(RE2::FullMatch("hello", "hello"));
+ CHECK(RE2::FullMatch("hello", "h.*o"));
+ CHECK(!RE2::FullMatch("othello", "h.*o")); // Must be anchored at front
+ CHECK(!RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end
+}
+
+TEST(RE2, PartialMatch) {
+ /***** PartialMatch *****/
+
+ CHECK(RE2::PartialMatch("x", "x"));
+ CHECK(RE2::PartialMatch("hello", "h.*o"));
+ CHECK(RE2::PartialMatch("othello", "h.*o"));
+ CHECK(RE2::PartialMatch("hello!", "h.*o"));
+ CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
+}
+
+TEST(RE2, FullMatchZeroArg) {
+ // Zero-arg
+ CHECK(RE2::FullMatch("1001", "\\d+"));
+}
+
+TEST(RE2, FullMatchOneArg) {
+ int i;
+
+ // Single-arg
+ CHECK(RE2::FullMatch("1001", "(\\d+)", &i));
+ CHECK_EQ(i, 1001);
+ CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i));
+ CHECK_EQ(i, -123);
+ CHECK(!RE2::FullMatch("10", "()\\d+", &i));
+ CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
+ "(\\d+)", &i));
+}
+
+TEST(RE2, FullMatchIntegerArg) {
+ int i;
+
+ // Digits surrounding integer-arg
+ CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i));
+ CHECK_EQ(i, 23);
+ CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i));
+ CHECK_EQ(i, 1);
+ CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i));
+ CHECK_EQ(i, -1);
+ CHECK(RE2::PartialMatch("1234", "(\\d)", &i));
+ CHECK_EQ(i, 1);
+ CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i));
+ CHECK_EQ(i, -1);
+}
+
+TEST(RE2, FullMatchStringArg) {
+ string s;
+ // String-arg
+ CHECK(RE2::FullMatch("hello", "h(.*)o", &s));
+ CHECK_EQ(s, string("ell"));
+}
+
+TEST(RE2, FullMatchStringPieceArg) {
+ int i;
+ // StringPiece-arg
+ StringPiece sp;
+ CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
+ CHECK_EQ(sp.size(), 4);
+ CHECK(memcmp(sp.data(), "ruby", 4) == 0);
+ CHECK_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchMultiArg) {
+ int i;
+ string s;
+ // Multi-arg
+ CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+ CHECK_EQ(s, string("ruby"));
+ CHECK_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchIgnoredArg) {
+ int i;
+ string s;
+ // Ignored arg
+ CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i));
+ CHECK_EQ(s, string("ruby"));
+ CHECK_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchTypedNullArg) {
+ string s;
+
+ // Ignore non-void* NULL arg
+ CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
+ CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL));
+ CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
+ CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL));
+ CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
+ CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
+ CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));
+
+ // Fail on non-void* NULL arg if the match doesn't parse for the given type.
+ CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL));
+ CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL));
+ CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
+ CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL));
+ CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
+}
+
+TEST(RE2, FullMatchTypeTests) {
+ // Type tests
+ {
+ char c;
+ CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
+ CHECK_EQ(c, 'H');
+ }
+ {
+ unsigned char c;
+ CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
+ CHECK_EQ(c, static_cast<unsigned char>('H'));
+ }
+ {
+ int16 v;
+ CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("32767", "(-?\\d+)", &v)); CHECK_EQ(v, 32767);
+ CHECK(RE2::FullMatch("-32768", "(-?\\d+)", &v)); CHECK_EQ(v, -32768);
+ CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
+ CHECK(!RE2::FullMatch("32768", "(-?\\d+)", &v));
+ }
+ {
+ uint16 v;
+ CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("32767", "(\\d+)", &v)); CHECK_EQ(v, 32767);
+ CHECK(RE2::FullMatch("65535", "(\\d+)", &v)); CHECK_EQ(v, 65535);
+ CHECK(!RE2::FullMatch("65536", "(\\d+)", &v));
+ }
+ {
+ int32 v;
+ static const int32 max_value = 0x7fffffff;
+ static const int32 min_value = -max_value - 1;
+ CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100);
+ CHECK(RE2::FullMatch("2147483647", "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+ CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
+ CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)",&v));
+ CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)",&v));
+ }
+ {
+ uint32 v;
+ static const uint32 max_value = 0xfffffffful;
+ CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max_value);
+ CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
+ }
+ {
+ int64 v;
+ static const int64 max_value = 0x7fffffffffffffffull;
+ static const int64 min_value = -max_value - 1;
+ char buf[32];
+
+ CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v)); CHECK_EQ(v, -100);
+
+ snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+
+ snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, min_value);
+
+ snprintf(buf, sizeof(buf), "%lld", (long long)max_value);
+ assert(buf[strlen(buf)-1] != '9');
+ buf[strlen(buf)-1]++;
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+
+ snprintf(buf, sizeof(buf), "%lld", (long long)min_value);
+ assert(buf[strlen(buf)-1] != '9');
+ buf[strlen(buf)-1]++;
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+ }
+ {
+ uint64 v;
+ int64 v2;
+ static const uint64 max_value = 0xffffffffffffffffull;
+ char buf[32];
+
+ CHECK(RE2::FullMatch("100", "(-?\\d+)",&v)); CHECK_EQ(v, 100);
+ CHECK(RE2::FullMatch("-100", "(-?\\d+)",&v2)); CHECK_EQ(v2, -100);
+
+ snprintf(buf, sizeof(buf), "%llu", (unsigned long long)max_value);
+ CHECK(RE2::FullMatch(buf, "(-?\\d+)",&v)); CHECK_EQ(v, max_value);
+
+ assert(buf[strlen(buf)-1] != '9');
+ buf[strlen(buf)-1]++;
+ CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
+ }
+ {
+ float v;
+ CHECK(RE2::FullMatch("100", "(.*)", &v));
+ CHECK(RE2::FullMatch("-100.", "(.*)", &v));
+ CHECK(RE2::FullMatch("1e23", "(.*)", &v));
+ }
+ {
+ double v;
+ CHECK(RE2::FullMatch("100", "(.*)", &v));
+ CHECK(RE2::FullMatch("-100.", "(.*)", &v));
+ CHECK(RE2::FullMatch("1e23", "(.*)", &v));
+ }
+}
+
+TEST(RE2, FullMatchAnchored) {
+ int i;
+ // Check that matching is fully anchored
+ CHECK(!RE2::FullMatch("x1001", "(\\d+)", &i));
+ CHECK(!RE2::FullMatch("1001x", "(\\d+)", &i));
+ CHECK(RE2::FullMatch("x1001", "x(\\d+)", &i)); CHECK_EQ(i, 1001);
+ CHECK(RE2::FullMatch("1001x", "(\\d+)x", &i)); CHECK_EQ(i, 1001);
+}
+
+TEST(RE2, FullMatchBraces) {
+ // Braces
+ CHECK(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}"));
+ CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}"));
+ CHECK(!RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}"));
+}
+
+TEST(RE2, Complicated) {
+ // Complicated RE2
+ CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]"));
+ CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]"));
+ CHECK(RE2::FullMatch("X", "foo|bar|[A-Z]"));
+ CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]"));
+}
+
+TEST(RE2, FullMatchEnd) {
+ // Check full-match handling (needs '$' tacked on internally)
+ CHECK(RE2::FullMatch("fo", "fo|foo"));
+ CHECK(RE2::FullMatch("foo", "fo|foo"));
+ CHECK(RE2::FullMatch("fo", "fo|foo$"));
+ CHECK(RE2::FullMatch("foo", "fo|foo$"));
+ CHECK(RE2::FullMatch("foo", "foo$"));
+ CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
+ CHECK(!RE2::FullMatch("fox", "fo|bar"));
+
+ // Uncomment the following if we change the handling of '$' to
+ // prevent it from matching a trailing newline
+ if (false) {
+ // Check that we don't get bitten by pcre's special handling of a
+ // '\n' at the end of the string matching '$'
+ CHECK(!RE2::PartialMatch("foo\n", "foo$"));
+ }
+}
+
+TEST(RE2, FullMatchArgCount) {
+ // Number of args
+ int a[16];
+ CHECK(RE2::FullMatch("", ""));
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("1",
+ "(\\d){1}",
+ &a[0]));
+ CHECK_EQ(a[0], 1);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("12",
+ "(\\d)(\\d)",
+ &a[0], &a[1]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("123",
+ "(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("1234",
+ "(\\d)(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2], &a[3]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+ CHECK_EQ(a[3], 4);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("12345",
+ "(\\d)(\\d)(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2], &a[3],
+ &a[4]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+ CHECK_EQ(a[3], 4);
+ CHECK_EQ(a[4], 5);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("123456",
+ "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+ CHECK_EQ(a[3], 4);
+ CHECK_EQ(a[4], 5);
+ CHECK_EQ(a[5], 6);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("1234567",
+ "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+ CHECK_EQ(a[3], 4);
+ CHECK_EQ(a[4], 5);
+ CHECK_EQ(a[5], 6);
+ CHECK_EQ(a[6], 7);
+
+ memset(a, 0, sizeof(0));
+ CHECK(RE2::FullMatch("1234567890123456",
+ "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
+ "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
+ &a[0], &a[1], &a[2], &a[3],
+ &a[4], &a[5], &a[6], &a[7],
+ &a[8], &a[9], &a[10], &a[11],
+ &a[12], &a[13], &a[14], &a[15]));
+ CHECK_EQ(a[0], 1);
+ CHECK_EQ(a[1], 2);
+ CHECK_EQ(a[2], 3);
+ CHECK_EQ(a[3], 4);
+ CHECK_EQ(a[4], 5);
+ CHECK_EQ(a[5], 6);
+ CHECK_EQ(a[6], 7);
+ CHECK_EQ(a[7], 8);
+ CHECK_EQ(a[8], 9);
+ CHECK_EQ(a[9], 0);
+ CHECK_EQ(a[10], 1);
+ CHECK_EQ(a[11], 2);
+ CHECK_EQ(a[12], 3);
+ CHECK_EQ(a[13], 4);
+ CHECK_EQ(a[14], 5);
+ CHECK_EQ(a[15], 6);
+}
+
+TEST(RE2, Accessors) {
+ // Check the pattern() accessor
+ {
+ const string kPattern = "http://([^/]+)/.*";
+ const RE2 re(kPattern);
+ CHECK_EQ(kPattern, re.pattern());
+ }
+
+ // Check RE2 error field.
+ {
+ RE2 re("foo");
+ CHECK(re.error().empty()); // Must have no error
+ CHECK(re.ok());
+ CHECK(re.error_code() == RE2::NoError);
+ }
+}
+
+TEST(RE2, UTF8) {
+ // Check UTF-8 handling
+ // Three Japanese characters (nihongo)
+ const char utf8_string[] = {
+ 0xe6, 0x97, 0xa5, // 65e5
+ 0xe6, 0x9c, 0xac, // 627c
+ 0xe8, 0xaa, 0x9e, // 8a9e
+ 0
+ };
+ const char utf8_pattern[] = {
+ '.',
+ 0xe6, 0x9c, 0xac, // 627c
+ '.',
+ 0
+ };
+
+ // Both should match in either mode, bytes or UTF-8
+ RE2 re_test1(".........", RE2::Latin1);
+ CHECK(RE2::FullMatch(utf8_string, re_test1));
+ RE2 re_test2("...");
+ CHECK(RE2::FullMatch(utf8_string, re_test2));
+
+ // Check that '.' matches one byte or UTF-8 character
+ // according to the mode.
+ string s;
+ RE2 re_test3("(.)", RE2::Latin1);
+ CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
+ CHECK_EQ(s, string("\xe6"));
+ RE2 re_test4("(.)");
+ CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
+ CHECK_EQ(s, string("\xe6\x97\xa5"));
+
+ // Check that string matches itself in either mode
+ RE2 re_test5(utf8_string, RE2::Latin1);
+ CHECK(RE2::FullMatch(utf8_string, re_test5));
+ RE2 re_test6(utf8_string);
+ CHECK(RE2::FullMatch(utf8_string, re_test6));
+
+ // Check that pattern matches string only in UTF8 mode
+ RE2 re_test7(utf8_pattern, RE2::Latin1);
+ CHECK(!RE2::FullMatch(utf8_string, re_test7));
+ RE2 re_test8(utf8_pattern);
+ CHECK(RE2::FullMatch(utf8_string, re_test8));
+}
+
+TEST(RE2, UngreedyUTF8) {
+ // Check that ungreedy, UTF8 regular expressions don't match when they
+ // oughtn't -- see bug 82246.
+ {
+ // This code always worked.
+ const char* pattern = "\\w+X";
+ const string target = "a aX";
+ RE2 match_sentence(pattern, RE2::Latin1);
+ RE2 match_sentence_re(pattern);
+
+ CHECK(!RE2::FullMatch(target, match_sentence));
+ CHECK(!RE2::FullMatch(target, match_sentence_re));
+ }
+ {
+ const char* pattern = "(?U)\\w+X";
+ const string target = "a aX";
+ RE2 match_sentence(pattern, RE2::Latin1);
+ CHECK_EQ(match_sentence.error(), "");
+ RE2 match_sentence_re(pattern);
+
+ CHECK(!RE2::FullMatch(target, match_sentence));
+ CHECK(!RE2::FullMatch(target, match_sentence_re));
+ }
+}
+
+TEST(RE2, Rejects) {
+ { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); }
+ {
+ RE2 re("a[x", RE2::Quiet);
+ CHECK(!re.ok());
+ }
+ {
+ RE2 re("a[z-a]", RE2::Quiet);
+ CHECK(!re.ok());
+ }
+ {
+ RE2 re("a[[:foobar:]]", RE2::Quiet);
+ CHECK(!re.ok());
+ }
+ {
+ RE2 re("a(b", RE2::Quiet);
+ CHECK(!re.ok());
+ }
+ {
+ RE2 re("a\\", RE2::Quiet);
+ CHECK(!re.ok());
+ }
+}
+
+TEST(RE2, NoCrash) {
+ // Test that using a bad regexp doesn't crash.
+ {
+ RE2 re("a\\", RE2::Quiet);
+ CHECK(!re.ok());
+ CHECK(!RE2::PartialMatch("a\\b", re));
+ }
+
+ // Test that using an enormous regexp doesn't crash
+ {
+ RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet);
+ CHECK(!re.ok());
+ CHECK(!RE2::PartialMatch("aaa", re));
+ }
+
+ // Test that a crazy regexp still compiles and runs.
+ {
+ RE2 re(".{512}x", RE2::Quiet);
+ CHECK(re.ok());
+ string s;
+ s.append(515, 'c');
+ s.append("x");
+ LOG(INFO) << "Expect to see DFA out of memory print...";
+ CHECK(RE2::PartialMatch(s, re));
+ }
+}
+
+TEST(RE2, Recursion) {
+ // Test that recursion is stopped.
+ // This test is PCRE-legacy -- there's no recursion in RE2.
+ int bytes = 15 * 1024; // enough to crash PCRE
+ TestRecursion(bytes, ".");
+ TestRecursion(bytes, "a");
+ TestRecursion(bytes, "a.");
+ TestRecursion(bytes, "ab.");
+ TestRecursion(bytes, "abc.");
+}
+
+TEST(RE2, BigCountedRepetition) {
+ // Test that counted repetition works, given tons of memory.
+ RE2::Options opt;
+ opt.set_max_mem(256<<20);
+
+ RE2 re(".{512}x", opt);
+ CHECK(re.ok());
+ string s;
+ s.append(515, 'c');
+ s.append("x");
+ CHECK(RE2::PartialMatch(s, re));
+}
+
+TEST(RE2, DeepRecursion) {
+ // Test for deep stack recursion. This would fail with a
+ // segmentation violation due to stack overflow before pcre was
+ // patched.
+ // Again, a PCRE legacy test. RE2 doesn't recurse.
+ rlimit rl;
+ rl.rlim_max = rl.rlim_cur = 1 * 1024 * 1024;
+ CHECK_EQ(0, setrlimit(RLIMIT_STACK, &rl));
+ string comment("/*");
+ string a(16384, 'a');
+ comment += a;
+ comment += "*/";
+ RE2 re("((?:\\s|//.*\n|/[*](?:\n|.)*?[*]/)*)");
+ CHECK(RE2::FullMatch(comment, re));
+}
+
+// Suggested by Josh Hyman. Failed when SearchOnePass was
+// not implementing case-folding.
+TEST(CaseInsensitive, MatchAndConsume) {
+ string result;
+ string text = "A fish named *Wanda*";
+ StringPiece sp(text);
+
+ EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
+ EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
+}
+
+// RE2 should permit implicit conversions from string, StringPiece, const char*,
+// and C string literals.
+TEST(RE2, ImplicitConversions) {
+ string re_string(".");
+ StringPiece re_stringpiece(".");
+ const char* re_cstring = ".";
+ EXPECT_TRUE(RE2::PartialMatch("e", re_string));
+ EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
+ EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
+ EXPECT_TRUE(RE2::PartialMatch("e", "."));
+}
+
+// Bugs introduced by 8622304
+TEST(RE2, CL8622304) {
+ // reported by ingow
+ string dir;
+ EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok
+ EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails
+
+ // reported by jacobsa
+ string key, val;
+ EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
+ "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
+ &key,
+ &val));
+ EXPECT_EQ(key, "bar");
+ EXPECT_EQ(val, "1,0x2F,030,4,5");
+}
+
+
+// Check that RE2 returns correct regexp pieces on error.
+// In particular, make sure it returns whole runes
+// and that it always reports invalid UTF-8.
+// Also check that Perl error flag piece is big enough.
+static struct ErrorTest {
+ const char *regexp;
+ const char *error;
+} error_tests[] = {
+ { "ab\\αcd", "\\α" },
+ { "ef\\x☺01", "\\x☺0" },
+ { "gh\\x1☺01", "\\x1☺" },
+ { "ij\\x1", "\\x1" },
+ { "kl\\x", "\\x" },
+ { "uv\\x{0000☺}", "\\x{0000☺" },
+ { "wx\\p{ABC", "\\p{ABC" },
+ { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X
+ { "aa(?sm☺i", "(?sm☺" },
+ { "bb[abc", "[abc" },
+
+ { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8
+ { "op\377qr", "" },
+ { "st\\x{00000\377", "" },
+ { "zz\\p{\377}", "" },
+ { "zz\\x{00\377}", "" },
+ { "zz(?P<name\377>abc)", "" },
+};
+TEST(RE2, ErrorArgs) {
+ for (int i = 0; i < arraysize(error_tests); i++) {
+ RE2 re(error_tests[i].regexp, RE2::Quiet);
+ EXPECT_FALSE(re.ok());
+ EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error();
+ }
+}
+
+// Check that "never match \n" mode never matches \n.
+static struct NeverTest {
+ const char* regexp;
+ const char* text;
+ const char* match;
+} never_tests[] = {
+ { "(.*)", "abc\ndef\nghi\n", "abc" },
+ { "(?s)(abc.*def)", "abc\ndef\n", NULL },
+ { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
+ { "(abc[^x]*def)", "abc\ndef\n", NULL },
+ { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
+};
+TEST(RE2, NeverNewline) {
+ RE2::Options opt;
+ opt.set_never_nl(true);
+ for (int i = 0; i < arraysize(never_tests); i++) {
+ const NeverTest& t = never_tests[i];
+ RE2 re(t.regexp, opt);
+ if (t.match == NULL) {
+ EXPECT_FALSE(re.PartialMatch(t.text, re));
+ } else {
+ StringPiece m;
+ EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
+ EXPECT_EQ(m, t.match);
+ }
+ }
+}
+
+// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
+// Triggered by a failed DFA search falling back to Bitstate when
+// using Match with a NULL submatch set. Bitstate tried to read
+// the submatch[0] entry even if nsubmatch was 0.
+TEST(RE2, BitstateCaptureBug) {
+ RE2::Options opt;
+ opt.set_max_mem(20000);
+ RE2 re("(_________$)", opt);
+ EXPECT_FALSE(re.Match(
+ "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x",
+ 0, RE2::UNANCHORED, NULL, 0));
+}
+
+// C++ version of bug 609710.
+TEST(RE2, UnicodeClasses) {
+ const string str = "ABCDEFGHIèšæ°¸é‹’";
+ string a, b, c;
+
+ EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
+ EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
+ EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("èš", "\\p{L}"));
+ EXPECT_FALSE(RE2::FullMatch("èš", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("èš", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("èš", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("èš", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("èš", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("æ°¸", "\\p{L}"));
+ EXPECT_FALSE(RE2::FullMatch("æ°¸", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("æ°¸", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("æ°¸", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("æ°¸", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("æ°¸", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::FullMatch("é‹’", "\\p{L}"));
+ EXPECT_FALSE(RE2::FullMatch("é‹’", "\\p{Lu}"));
+ EXPECT_FALSE(RE2::FullMatch("é‹’", "\\p{Ll}"));
+ EXPECT_FALSE(RE2::FullMatch("é‹’", "\\P{L}"));
+ EXPECT_TRUE(RE2::FullMatch("é‹’", "\\P{Lu}"));
+ EXPECT_TRUE(RE2::FullMatch("é‹’", "\\P{Ll}"));
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
+
+ EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
+ EXPECT_EQ("A", a);
+ EXPECT_EQ("B", b);
+ EXPECT_EQ("C", c);
+
+ EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
+
+ EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
+ EXPECT_EQ("èš", a);
+ EXPECT_EQ("æ°¸", b);
+ EXPECT_EQ("é‹’", c);
+}
+
+// Bug reported by saito. 2009/02/17
+TEST(RE2, NullVsEmptyString) {
+ RE2 re2(".*");
+ StringPiece v1("");
+ EXPECT_TRUE(RE2::FullMatch(v1, re2));
+
+ StringPiece v2;
+ EXPECT_TRUE(RE2::FullMatch(v2, re2));
+}
+
+// Issue 1816809
+TEST(RE2, Bug1816809) {
+ RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
+ StringPiece piece("llx-3;llx4");
+ string x;
+ CHECK(RE2::Consume(&piece, re, &x));
+}
+
+} // namespace re2
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
new file mode 100644
index 0000000..1a97129
--- /dev/null
+++ b/re2/testing/regexp_benchmark.cc
@@ -0,0 +1,1461 @@
+// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Benchmarks for regular expression implementations.
+
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "util/pcre.h"
+#include "util/benchmark.h"
+
+namespace re2 {
+void Test();
+void MemoryUsage();
+} // namespace re2
+
+typedef testing::MallocCounter MallocCounter;
+
+namespace re2 {
+
+void Test() {
+ Regexp* re = Regexp::Parse("(\\d+)-(\\d+)-(\\d+)", Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ const char* text = "650-253-0001";
+ StringPiece sp[4];
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ CHECK_EQ(sp[0], "650-253-0001");
+ CHECK_EQ(sp[1], "650");
+ CHECK_EQ(sp[2], "253");
+ CHECK_EQ(sp[3], "0001");
+ delete prog;
+ re->Decref();
+ LOG(INFO) << "test passed\n";
+}
+
+void MemoryUsage() {
+ const char* regexp = "(\\d+)-(\\d+)-(\\d+)";
+ const char* text = "650-253-0001";
+ {
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly,
+ // because LOG(INFO) might do a big allocation before they get evaluated.
+ fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ mc.Reset();
+
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ mc.Reset();
+
+ StringPiece sp[4];
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ delete prog;
+ re->Decref();
+ }
+
+ {
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ PCRE re(regexp, PCRE::UTF8);
+ fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ PCRE::FullMatch(text, re);
+ fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ }
+
+ {
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ PCRE* re = new PCRE(regexp, PCRE::UTF8);
+ fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ PCRE::FullMatch(text, *re);
+ fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ delete re;
+ }
+
+ {
+ MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
+
+ RE2 re(regexp);
+ fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ RE2::FullMatch(text, re);
+ fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth());
+ }
+
+ fprintf(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n",
+ static_cast<int>(sizeof(PCRE)),
+ static_cast<int>(sizeof(RE2)),
+ static_cast<int>(sizeof(Prog)),
+ static_cast<int>(sizeof(Inst)));
+}
+
+// Regular expression implementation wrappers.
+// Defined at bottom of file, but they are repetitive
+// and not interesting.
+
+typedef void SearchImpl(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match);
+
+SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState,
+ SearchPCRE, SearchRE2,
+ SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, SearchCachedBitState,
+ SearchCachedPCRE, SearchCachedRE2;
+
+typedef void ParseImpl(int iters, const char* regexp, const StringPiece& text);
+
+ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState,
+ Parse1PCRE, Parse1RE2,
+ Parse1Backtrack,
+ Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState,
+ Parse1CachedPCRE, Parse1CachedRE2,
+ Parse1CachedBacktrack;
+
+ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState,
+ Parse3PCRE, Parse3RE2,
+ Parse3Backtrack,
+ Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState,
+ Parse3CachedPCRE, Parse3CachedRE2,
+ Parse3CachedBacktrack;
+
+ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2;
+
+ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2;
+
+// Benchmark: failed search for regexp in random text.
+
+// Generate random text that won't contain the search string,
+// to test worst-case search behavior.
+void MakeText(string* text, int nbytes) {
+ text->resize(nbytes);
+ srand(0);
+ for (int i = 0; i < nbytes; i++) {
+ if (!rand()%30)
+ (*text)[i] = '\n';
+ else
+ (*text)[i] = rand()%(0x7E + 1 - 0x20)+0x20;
+ }
+}
+
+// Makes text of size nbytes, then calls run to search
+// the text for regexp iters times.
+void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, nbytes);
+ BenchmarkMemoryUsage();
+ StartBenchmarkTiming();
+ search(iters, regexp, s, Prog::kUnanchored, false);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+// These two are easy because they start with an A,
+// giving the search loop something to memchr for.
+#define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+#define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
+
+// This is a little harder, since it starts with a character class
+// and thus can't be memchr'ed. Could look for ABC and work backward,
+// but no one does that.
+#define MEDIUM "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+
+// This is a fair amount harder, because of the leading [ -~]*.
+// A bad backtracking implementation will take O(text^2) time to
+// figure out there's no match.
+#define HARD "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
+
+// This stresses engines that are trying to track parentheses.
+#define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \
+ "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
+
+void Search_Easy0_CachedDFA(int i, int n) { Search(i, n, EASY0, SearchCachedDFA); }
+void Search_Easy0_CachedNFA(int i, int n) { Search(i, n, EASY0, SearchCachedNFA); }
+void Search_Easy0_CachedPCRE(int i, int n) { Search(i, n, EASY0, SearchCachedPCRE); }
+void Search_Easy0_CachedRE2(int i, int n) { Search(i, n, EASY0, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Easy1_CachedDFA(int i, int n) { Search(i, n, EASY1, SearchCachedDFA); }
+void Search_Easy1_CachedNFA(int i, int n) { Search(i, n, EASY1, SearchCachedNFA); }
+void Search_Easy1_CachedPCRE(int i, int n) { Search(i, n, EASY1, SearchCachedPCRE); }
+void Search_Easy1_CachedRE2(int i, int n) { Search(i, n, EASY1, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Medium_CachedDFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedDFA); }
+void Search_Medium_CachedNFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedNFA); }
+void Search_Medium_CachedPCRE(int i, int n) { Search(i, n, MEDIUM, SearchCachedPCRE); }
+void Search_Medium_CachedRE2(int i, int n) { Search(i, n, MEDIUM, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Hard_CachedDFA(int i, int n) { Search(i, n, HARD, SearchCachedDFA); }
+void Search_Hard_CachedNFA(int i, int n) { Search(i, n, HARD, SearchCachedNFA); }
+void Search_Hard_CachedPCRE(int i, int n) { Search(i, n, HARD, SearchCachedPCRE); }
+void Search_Hard_CachedRE2(int i, int n) { Search(i, n, HARD, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Parens_CachedDFA(int i, int n) { Search(i, n, PARENS, SearchCachedDFA); }
+void Search_Parens_CachedNFA(int i, int n) { Search(i, n, PARENS, SearchCachedNFA); }
+void Search_Parens_CachedPCRE(int i, int n) { Search(i, n, PARENS, SearchCachedPCRE); }
+void Search_Parens_CachedRE2(int i, int n) { Search(i, n, PARENS, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void SearchBigFixed(int iters, int nbytes, SearchImpl* search) {
+ StopBenchmarkTiming();
+ string s;
+ s.append(nbytes/2, 'x');
+ string regexp = "^" + s + ".*$";
+ string t;
+ MakeText(&t, nbytes/2);
+ s += t;
+ BenchmarkMemoryUsage();
+ StartBenchmarkTiming();
+ search(iters, regexp.c_str(), s, Prog::kUnanchored, true);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+void Search_BigFixed_CachedDFA(int i, int n) { SearchBigFixed(i, n, SearchCachedDFA); }
+void Search_BigFixed_CachedNFA(int i, int n) { SearchBigFixed(i, n, SearchCachedNFA); }
+void Search_BigFixed_CachedPCRE(int i, int n) { SearchBigFixed(i, n, SearchCachedPCRE); }
+void Search_BigFixed_CachedRE2(int i, int n) { SearchBigFixed(i, n, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: FindAndConsume
+void FindAndConsume(int iters, int nbytes) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, nbytes);
+ s.append("Hello World");
+ StartBenchmarkTiming();
+ RE2 re("((Hello World))");
+ for (int i = 0; i < iters; i++) {
+ StringPiece t = s;
+ StringPiece u;
+ CHECK(RE2::FindAndConsume(&t, re, &u));
+ CHECK_EQ(u, "Hello World");
+ }
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: successful anchored search.
+
+void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) {
+ string s;
+ MakeText(&s, nbytes);
+ BenchmarkMemoryUsage();
+ search(iters, regexp, s, Prog::kAnchored, true);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+// Unambiguous search (RE2 can use OnePass).
+
+void Search_Success_DFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchDFA); }
+void Search_Success_OnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchOnePass); }
+void Search_Success_PCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchPCRE); }
+void Search_Success_RE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchRE2); }
+
+BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Success_CachedDFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedDFA); }
+void Search_Success_CachedOnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedOnePass); }
+void Search_Success_CachedPCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedPCRE); }
+void Search_Success_CachedRE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+// Ambiguous search (RE2 cannot use OnePass).
+
+void Search_Success1_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchDFA); }
+void Search_Success1_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchPCRE); }
+void Search_Success1_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchRE2); }
+void Search_Success1_BitState(int i, int n) { SearchSuccess(i, n, ".*.$", SearchBitState); }
+
+BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
+
+void Search_Success1_Cached_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedDFA); }
+void Search_Success1_Cached_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedPCRE); }
+void Search_Success1_Cached_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Success1_Cached_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Success1_Cached_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+// Benchmark: use regexp to find phone number.
+
+void SearchDigits(int iters, SearchImpl* search) {
+ const char *text = "650-253-0001";
+ int len = strlen(text);
+ BenchmarkMemoryUsage();
+ search(iters, "([0-9]+)-([0-9]+)-([0-9]+)",
+ StringPiece(text, len), Prog::kAnchored, true);
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Search_Digits_DFA(int i) { SearchDigits(i, SearchDFA); }
+void Search_Digits_NFA(int i) { SearchDigits(i, SearchNFA); }
+void Search_Digits_OnePass(int i) { SearchDigits(i, SearchOnePass); }
+void Search_Digits_PCRE(int i) { SearchDigits(i, SearchPCRE); }
+void Search_Digits_RE2(int i) { SearchDigits(i, SearchRE2); }
+void Search_Digits_BitState(int i) { SearchDigits(i, SearchBitState); }
+
+BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Search_Digits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Search_Digits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Search_Digits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: use regexp to parse digit fields in phone number.
+
+void Parse3Digits(int iters,
+ void (*parse3)(int, const char*, const StringPiece&)) {
+ BenchmarkMemoryUsage();
+ parse3(iters, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001");
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_Digits_NFA(int i) { Parse3Digits(i, Parse3NFA); }
+void Parse_Digits_OnePass(int i) { Parse3Digits(i, Parse3OnePass); }
+void Parse_Digits_PCRE(int i) { Parse3Digits(i, Parse3PCRE); }
+void Parse_Digits_RE2(int i) { Parse3Digits(i, Parse3RE2); }
+void Parse_Digits_Backtrack(int i) { Parse3Digits(i, Parse3Backtrack); }
+void Parse_Digits_BitState(int i) { Parse3Digits(i, Parse3BitState); }
+
+BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_Digits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs());
+
+void Parse_CachedDigits_NFA(int i) { Parse3Digits(i, Parse3CachedNFA); }
+void Parse_CachedDigits_OnePass(int i) { Parse3Digits(i, Parse3CachedOnePass); }
+void Parse_CachedDigits_PCRE(int i) { Parse3Digits(i, Parse3CachedPCRE); }
+void Parse_CachedDigits_RE2(int i) { Parse3Digits(i, Parse3CachedRE2); }
+void Parse_CachedDigits_Backtrack(int i) { Parse3Digits(i, Parse3CachedBacktrack); }
+void Parse_CachedDigits_BitState(int i) { Parse3Digits(i, Parse3CachedBitState); }
+
+BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedDigits_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs());
+
+void Parse3DigitDs(int iters,
+ void (*parse3)(int, const char*, const StringPiece&)) {
+ BenchmarkMemoryUsage();
+ parse3(iters, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001");
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_DigitDs_NFA(int i) { Parse3DigitDs(i, Parse3NFA); }
+void Parse_DigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3OnePass); }
+void Parse_DigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3PCRE); }
+void Parse_DigitDs_RE2(int i) { Parse3DigitDs(i, Parse3RE2); }
+void Parse_DigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); }
+void Parse_DigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); }
+
+BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_DigitDs_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs());
+
+void Parse_CachedDigitDs_NFA(int i) { Parse3DigitDs(i, Parse3CachedNFA); }
+void Parse_CachedDigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3CachedOnePass); }
+void Parse_CachedDigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3CachedPCRE); }
+void Parse_CachedDigitDs_RE2(int i) { Parse3DigitDs(i, Parse3CachedRE2); }
+void Parse_CachedDigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); }
+void Parse_CachedDigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); }
+
+BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedDigitDs_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedDigitDs_Backtrack)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: splitting off leading number field.
+
+void Parse1Split(int iters,
+ void (*parse1)(int, const char*, const StringPiece&)) {
+ BenchmarkMemoryUsage();
+ parse1(iters, "[0-9]+-(.*)", "650-253-0001");
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_Split_NFA(int i) { Parse1Split(i, Parse1NFA); }
+void Parse_Split_OnePass(int i) { Parse1Split(i, Parse1OnePass); }
+void Parse_Split_PCRE(int i) { Parse1Split(i, Parse1PCRE); }
+void Parse_Split_RE2(int i) { Parse1Split(i, Parse1RE2); }
+void Parse_Split_BitState(int i) { Parse1Split(i, Parse1BitState); }
+
+BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs());
+
+void Parse_CachedSplit_NFA(int i) { Parse1Split(i, Parse1CachedNFA); }
+void Parse_CachedSplit_OnePass(int i) { Parse1Split(i, Parse1CachedOnePass); }
+void Parse_CachedSplit_PCRE(int i) { Parse1Split(i, Parse1CachedPCRE); }
+void Parse_CachedSplit_RE2(int i) { Parse1Split(i, Parse1CachedRE2); }
+void Parse_CachedSplit_BitState(int i) { Parse1Split(i, Parse1CachedBitState); }
+
+BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplit_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplit_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs());
+
+// Benchmark: splitting off leading number field but harder (ambiguous regexp).
+
+void Parse1SplitHard(int iters,
+ void (*run)(int, const char*, const StringPiece&)) {
+ BenchmarkMemoryUsage();
+ run(iters, "[0-9]+.(.*)", "650-253-0001");
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_SplitHard_NFA(int i) { Parse1SplitHard(i, Parse1NFA); }
+void Parse_SplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1PCRE); }
+void Parse_SplitHard_RE2(int i) { Parse1SplitHard(i, Parse1RE2); }
+void Parse_SplitHard_BitState(int i) { Parse1SplitHard(i, Parse1BitState); }
+
+#ifdef USEPCRE
+BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs());
+
+void Parse_CachedSplitHard_NFA(int i) { Parse1SplitHard(i, Parse1CachedNFA); }
+void Parse_CachedSplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1CachedPCRE); }
+void Parse_CachedSplitHard_RE2(int i) { Parse1SplitHard(i, Parse1CachedRE2); }
+void Parse_CachedSplitHard_BitState(int i) { Parse1SplitHard(i, Parse1CachedBitState); }
+void Parse_CachedSplitHard_Backtrack(int i) { Parse1SplitHard(i, Parse1CachedBacktrack); }
+
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitHard_RE2)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_BitState)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_NFA)->ThreadRange(1, NumCPUs());
+BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs());
+
+// Benchmark: Parse1SplitHard, big text, small match.
+
+void Parse1SplitBig1(int iters,
+ void (*run)(int, const char*, const StringPiece&)) {
+ string s;
+ s.append(100000, 'x');
+ s.append("650-253-0001");
+ BenchmarkMemoryUsage();
+ run(iters, "[0-9]+.(.*)", s);
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_CachedSplitBig1_PCRE(int i) { Parse1SplitBig1(i, SearchParse1CachedPCRE); }
+void Parse_CachedSplitBig1_RE2(int i) { Parse1SplitBig1(i, SearchParse1CachedRE2); }
+
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: Parse1SplitHard, big text, big match.
+
+void Parse1SplitBig2(int iters,
+ void (*run)(int, const char*, const StringPiece&)) {
+ string s;
+ s.append("650-253-");
+ s.append(100000, '0');
+ BenchmarkMemoryUsage();
+ run(iters, "[0-9]+.(.*)", s);
+ SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_CachedSplitBig2_PCRE(int i) { Parse1SplitBig2(i, SearchParse1CachedPCRE); }
+void Parse_CachedSplitBig2_RE2(int i) { Parse1SplitBig2(i, SearchParse1CachedRE2); }
+
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: measure time required to parse (but not execute)
+// a simple regular expression.
+
+void ParseRegexp(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ re->Decref();
+ }
+}
+
+void SimplifyRegexp(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Regexp* sre = re->Simplify();
+ CHECK(sre);
+ sre->Decref();
+ re->Decref();
+ }
+}
+
+void NullWalkRegexp(int iters, const string& regexp) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ for (int i = 0; i < iters; i++) {
+ re->NullWalk();
+ }
+ re->Decref();
+}
+
+void SimplifyCompileRegexp(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Regexp* sre = re->Simplify();
+ CHECK(sre);
+ Prog* prog = sre->CompileToProg(0);
+ CHECK(prog);
+ delete prog;
+ sre->Decref();
+ re->Decref();
+ }
+}
+
+void CompileRegexp(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ delete prog;
+ re->Decref();
+ }
+}
+
+void CompileToProg(int iters, const string& regexp) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ for (int i = 0; i < iters; i++) {
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ delete prog;
+ }
+ re->Decref();
+}
+
+void CompileByteMap(int iters, const string& regexp) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ for (int i = 0; i < iters; i++) {
+ prog->ComputeByteMap();
+ }
+ delete prog;
+ re->Decref();
+}
+
+void CompilePCRE(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ }
+}
+
+void CompileRE2(int iters, const string& regexp) {
+ for (int i = 0; i < iters; i++) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ }
+}
+
+void RunBuild(int iters, const string& regexp, void (*run)(int, const string&)) {
+ run(iters, regexp);
+ SetBenchmarkItemsProcessed(iters);
+}
+
+} // namespace re2
+
+DEFINE_string(compile_regexp, "(.*)-(\\d+)-of-(\\d+)", "regexp for compile benchmarks");
+
+namespace re2 {
+
+void BM_PCRE_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompilePCRE); }
+void BM_Regexp_Parse(int i) { RunBuild(i, FLAGS_compile_regexp, ParseRegexp); }
+void BM_Regexp_Simplify(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyRegexp); }
+void BM_CompileToProg(int i) { RunBuild(i, FLAGS_compile_regexp, CompileToProg); }
+void BM_CompileByteMap(int i) { RunBuild(i, FLAGS_compile_regexp, CompileByteMap); }
+void BM_Regexp_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRegexp); }
+void BM_Regexp_SimplifyCompile(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyCompileRegexp); }
+void BM_Regexp_NullWalk(int i) { RunBuild(i, FLAGS_compile_regexp, NullWalkRegexp); }
+void BM_RE2_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRE2); }
+
+#ifdef USEPCRE
+BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(BM_Regexp_Parse)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_Simplify)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_CompileToProg)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_CompileByteMap)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_Compile)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs());
+BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs());
+
+
+// Makes text of size nbytes, then calls run to search
+// the text for regexp iters times.
+void SearchPhone(int iters, int nbytes, ParseImpl* search) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, nbytes);
+ s.append("(650) 253-0001");
+ BenchmarkMemoryUsage();
+ StartBenchmarkTiming();
+ search(iters, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+void SearchPhone_CachedPCRE(int i, int n) {
+ SearchPhone(i, n, SearchParse2CachedPCRE);
+}
+void SearchPhone_CachedRE2(int i, int n) {
+ SearchPhone(i, n, SearchParse2CachedRE2);
+}
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(SearchPhone_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+/*
+TODO(rsc): Make this work again.
+
+// Generates and returns a string over binary alphabet {0,1} that contains
+// all possible binary sequences of length n as subsequences. The obvious
+// brute force method would generate a string of length n * 2^n, but this
+// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
+// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
+static string DeBruijnString(int n) {
+ CHECK_LT(n, 8*sizeof(int));
+ CHECK_GT(n, 0);
+
+ vector<bool> did(1<<n);
+ for (int i = 0; i < 1<<n; i++)
+ did[i] = false;
+
+ string s;
+ for (int i = 0; i < n-1; i++)
+ s.append("0");
+ int bits = 0;
+ int mask = (1<<n) - 1;
+ for (int i = 0; i < (1<<n); i++) {
+ bits <<= 1;
+ bits &= mask;
+ if (!did[bits|1]) {
+ bits |= 1;
+ s.append("1");
+ } else {
+ s.append("0");
+ }
+ CHECK(!did[bits]);
+ did[bits] = true;
+ }
+ return s;
+}
+
+void CacheFill(int iters, int n, SearchImpl *srch) {
+ string s = DeBruijnString(n+1);
+ string t;
+ for (int i = n+1; i < 20; i++) {
+ t = s + s;
+ swap(s, t);
+ }
+ srch(iters, StringPrintf("0[01]{%d}$", n).c_str(), s,
+ Prog::kUnanchored, true);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*s.size());
+}
+
+void CacheFillPCRE(int i, int n) { CacheFill(i, n, SearchCachedPCRE); }
+void CacheFillRE2(int i, int n) { CacheFill(i, n, SearchCachedRE2); }
+void CacheFillNFA(int i, int n) { CacheFill(i, n, SearchCachedNFA); }
+void CacheFillDFA(int i, int n) { CacheFill(i, n, SearchCachedDFA); }
+
+// BENCHMARK_WITH_ARG uses __LINE__ to generate distinct identifiers
+// for the static BenchmarkRegisterer, which makes it unusable inside
+// a macro like DO24 below. MY_BENCHMARK_WITH_ARG uses the argument a
+// to make the identifiers distinct (only possible when 'a' is a simple
+// expression like 2, not like 1+1).
+#define MY_BENCHMARK_WITH_ARG(n, a) \
+ bool __benchmark_ ## n ## a = \
+ (new ::testing::Benchmark(#n, NewPermanentCallback(&n)))->ThreadRange(1, NumCPUs());
+
+#define DO24(A, B) \
+ A(B, 1); A(B, 2); A(B, 3); A(B, 4); A(B, 5); A(B, 6); \
+ A(B, 7); A(B, 8); A(B, 9); A(B, 10); A(B, 11); A(B, 12); \
+ A(B, 13); A(B, 14); A(B, 15); A(B, 16); A(B, 17); A(B, 18); \
+ A(B, 19); A(B, 20); A(B, 21); A(B, 22); A(B, 23); A(B, 24);
+
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillPCRE)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillNFA)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillRE2)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA)
+
+#undef DO24
+#undef MY_BENCHMARK_WITH_ARG
+*/
+
+////////////////////////////////////////////////////////////////////////
+//
+// Implementation routines. Sad that there are so many,
+// but all the interfaces are slightly different.
+
+// Runs implementation to search for regexp in text, iters times.
+// Expect_match says whether the regexp should be found.
+// Anchored says whether to run an anchored search.
+
+void SearchDFA(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ bool failed = false;
+ CHECK_EQ(prog->SearchDFA(text, NULL, anchor, Prog::kFirstMatch,
+ NULL, &failed),
+ expect_match);
+ CHECK(!failed);
+ delete prog;
+ re->Decref();
+ }
+}
+
+void SearchNFA(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ delete prog;
+ re->Decref();
+ }
+}
+
+void SearchOnePass(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ delete prog;
+ re->Decref();
+ }
+}
+
+void SearchBitState(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ delete prog;
+ re->Decref();
+ }
+}
+
+void SearchPCRE(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ if (anchor == Prog::kAnchored)
+ CHECK_EQ(PCRE::FullMatch(text, re), expect_match);
+ else
+ CHECK_EQ(PCRE::PartialMatch(text, re), expect_match);
+ }
+}
+
+void SearchRE2(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ for (int i = 0; i < iters; i++) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ if (anchor == Prog::kAnchored)
+ CHECK_EQ(RE2::FullMatch(text, re), expect_match);
+ else
+ CHECK_EQ(RE2::PartialMatch(text, re), expect_match);
+ }
+}
+
+// SearchCachedXXX is like SearchXXX but only does the
+// regexp parsing and compiling once. This lets us measure
+// search time without the per-regexp overhead.
+
+void SearchCachedDFA(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(1LL<<31);
+ CHECK(prog);
+ for (int i = 0; i < iters; i++) {
+ bool failed = false;
+ CHECK_EQ(prog->SearchDFA(text, NULL, anchor,
+ Prog::kFirstMatch, NULL, &failed),
+ expect_match);
+ CHECK(!failed);
+ }
+ delete prog;
+ re->Decref();
+}
+
+void SearchCachedNFA(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ for (int i = 0; i < iters; i++) {
+ CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ }
+ delete prog;
+ re->Decref();
+}
+
+void SearchCachedOnePass(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ for (int i = 0; i < iters; i++)
+ CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ delete prog;
+ re->Decref();
+}
+
+void SearchCachedBitState(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ for (int i = 0; i < iters; i++)
+ CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0),
+ expect_match);
+ delete prog;
+ re->Decref();
+}
+
+void SearchCachedPCRE(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ if (anchor == Prog::kAnchored)
+ CHECK_EQ(PCRE::FullMatch(text, re), expect_match);
+ else
+ CHECK_EQ(PCRE::PartialMatch(text, re), expect_match);
+ }
+}
+
+void SearchCachedRE2(int iters, const char* regexp, const StringPiece& text,
+ Prog::Anchor anchor, bool expect_match) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ if (anchor == Prog::kAnchored)
+ CHECK_EQ(RE2::FullMatch(text, re), expect_match);
+ else
+ CHECK_EQ(RE2::PartialMatch(text, re), expect_match);
+ }
+}
+
+
+// Runs implementation to full match regexp against text,
+// extracting three submatches. Expects match always.
+
+void Parse3NFA(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse3OnePass(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse3BitState(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse3Backtrack(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse3PCRE(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1, sp2, sp3;
+ CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
+ }
+}
+
+void Parse3RE2(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1, sp2, sp3;
+ CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
+ }
+}
+
+void Parse3CachedNFA(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++) {
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ }
+ delete prog;
+ re->Decref();
+}
+
+void Parse3CachedOnePass(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+}
+
+void Parse3CachedBitState(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+}
+
+void Parse3CachedBacktrack(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[4]; // 4 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
+ delete prog;
+ re->Decref();
+}
+
+void Parse3CachedPCRE(int iters, const char* regexp, const StringPiece& text) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1, sp2, sp3;
+ for (int i = 0; i < iters; i++) {
+ CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
+ }
+}
+
+void Parse3CachedRE2(int iters, const char* regexp, const StringPiece& text) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1, sp2, sp3;
+ for (int i = 0; i < iters; i++) {
+ CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
+ }
+}
+
+
+// Runs implementation to full match regexp against text,
+// extracting three submatches. Expects match always.
+
+void Parse1NFA(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse1OnePass(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse1BitState(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+ }
+}
+
+void Parse1PCRE(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1;
+ CHECK(PCRE::FullMatch(text, re, &sp1));
+ }
+}
+
+void Parse1RE2(int iters, const char* regexp, const StringPiece& text) {
+ for (int i = 0; i < iters; i++) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1;
+ CHECK(RE2::FullMatch(text, re, &sp1));
+ }
+}
+
+void Parse1CachedNFA(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++) {
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ }
+ delete prog;
+ re->Decref();
+}
+
+void Parse1CachedOnePass(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ CHECK(prog->IsOnePass());
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+}
+
+void Parse1CachedBitState(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+}
+
+void Parse1CachedBacktrack(int iters, const char* regexp, const StringPiece& text) {
+ Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
+ CHECK(re);
+ Prog* prog = re->CompileToProg(0);
+ CHECK(prog);
+ StringPiece sp[2]; // 2 because sp[0] is whole match.
+ for (int i = 0; i < iters; i++)
+ CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
+ delete prog;
+ re->Decref();
+}
+
+void Parse1CachedPCRE(int iters, const char* regexp, const StringPiece& text) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1;
+ for (int i = 0; i < iters; i++) {
+ CHECK(PCRE::FullMatch(text, re, &sp1));
+ }
+}
+
+void Parse1CachedRE2(int iters, const char* regexp, const StringPiece& text) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ StringPiece sp1;
+ for (int i = 0; i < iters; i++) {
+ CHECK(RE2::FullMatch(text, re, &sp1));
+ }
+}
+
+void SearchParse2CachedPCRE(int iters, const char* regexp,
+ const StringPiece& text) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ StringPiece sp1, sp2;
+ CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2));
+ }
+}
+
+void SearchParse2CachedRE2(int iters, const char* regexp,
+ const StringPiece& text) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ StringPiece sp1, sp2;
+ CHECK(RE2::PartialMatch(text, re, &sp1, &sp2));
+ }
+}
+
+void SearchParse1CachedPCRE(int iters, const char* regexp,
+ const StringPiece& text) {
+ PCRE re(regexp, PCRE::UTF8);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ StringPiece sp1;
+ CHECK(PCRE::PartialMatch(text, re, &sp1));
+ }
+}
+
+void SearchParse1CachedRE2(int iters, const char* regexp,
+ const StringPiece& text) {
+ RE2 re(regexp);
+ CHECK_EQ(re.error(), "");
+ for (int i = 0; i < iters; i++) {
+ StringPiece sp1;
+ CHECK(RE2::PartialMatch(text, re, &sp1));
+ }
+}
+
+void EmptyPartialMatchPCRE(int n) {
+ PCRE re("");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch("", re);
+ }
+}
+
+void EmptyPartialMatchRE2(int n) {
+ RE2 re("");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch("", re);
+ }
+}
+#ifdef USEPCRE
+BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+void SimplePartialMatchPCRE(int n) {
+ PCRE re("abcdefg");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch("abcdefg", re);
+ }
+}
+
+void SimplePartialMatchRE2(int n) {
+ RE2 re("abcdefg");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch("abcdefg", re);
+ }
+}
+#ifdef USEPCRE
+BENCHMARK(SimplePartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(SimplePartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+static string http_text =
+ "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf"
+ "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1";
+
+void HTTPPartialMatchPCRE(int n) {
+ StringPiece a;
+ PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+void HTTPPartialMatchRE2(int n) {
+ StringPiece a;
+ RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+static string http_smalltext =
+ "GET /abc HTTP/1.1";
+
+void SmallHTTPPartialMatchPCRE(int n) {
+ StringPiece a;
+ PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+void SmallHTTPPartialMatchRE2(int n) {
+ StringPiece a;
+ RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs());
+
+void DotMatchPCRE(int n) {
+ StringPiece a;
+ PCRE re("(?-s)^(.+)");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+void DotMatchRE2(int n) {
+ StringPiece a;
+ RE2 re("(?-s)^(.+)");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs());
+
+void ASCIIMatchPCRE(int n) {
+ StringPiece a;
+ PCRE re("(?-s)^([ -~]+)");
+ for (int i = 0; i < n; i++) {
+ PCRE::PartialMatch(http_text, re, &a);
+ }
+}
+
+void ASCIIMatchRE2(int n) {
+ StringPiece a;
+ RE2 re("(?-s)^([ -~]+)");
+ for (int i = 0; i < n; i++) {
+ RE2::PartialMatch(http_text, re, &a);
+ }
+}
+
+#ifdef USEPCRE
+BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs());
+
+void FullMatchPCRE(int iter, int n, const char *regexp) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, n);
+ s += "ABCDEFGHIJ";
+ BenchmarkMemoryUsage();
+ PCRE re(regexp);
+ StartBenchmarkTiming();
+ for (int i = 0; i < iter; i++)
+ CHECK(PCRE::FullMatch(s, re));
+ SetBenchmarkBytesProcessed(static_cast<int64>(iter)*n);
+}
+
+void FullMatchRE2(int iter, int n, const char *regexp) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, n);
+ s += "ABCDEFGHIJ";
+ BenchmarkMemoryUsage();
+ RE2 re(regexp, RE2::Latin1);
+ StartBenchmarkTiming();
+ for (int i = 0; i < iter; i++)
+ CHECK(RE2::FullMatch(s, re));
+ SetBenchmarkBytesProcessed(static_cast<int64>(iter)*n);
+}
+
+void FullMatch_DotStar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*"); }
+void FullMatch_DotStar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*"); }
+
+void FullMatch_DotStarDollar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*$"); }
+void FullMatch_DotStarDollar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*$"); }
+
+void FullMatch_DotStarCapture_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s)((.*)()()($))"); }
+void FullMatch_DotStarCapture_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s)((.*)()()($))"); }
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 2<<20);
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20);
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20);
+#endif
+BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20);
+
+} // namespace re2
diff --git a/re2/testing/regexp_generator.cc b/re2/testing/regexp_generator.cc
new file mode 100644
index 0000000..cf2db11
--- /dev/null
+++ b/re2/testing/regexp_generator.cc
@@ -0,0 +1,264 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression generator: generates all possible
+// regular expressions within parameters (see regexp_generator.h for details).
+
+// The regexp generator first generates a sequence of commands in a simple
+// postfix language. Each command in the language is a string,
+// like "a" or "%s*" or "%s|%s".
+//
+// To evaluate a command, enough arguments are popped from the value stack to
+// plug into the %s slots. Then the result is pushed onto the stack.
+// For example, the command sequence
+// a b %s%s c
+// results in the stack
+// ab c
+//
+// GeneratePostfix generates all possible command sequences.
+// Then RunPostfix turns each sequence into a regular expression
+// and passes the regexp to HandleRegexp.
+
+#include <string.h>
+#include <string>
+#include <stack>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+// Returns a vector of the egrep regexp operators.
+const vector<string>& RegexpGenerator::EgrepOps() {
+ static const char *ops[] = {
+ "%s%s",
+ "%s|%s",
+ "%s*",
+ "%s+",
+ "%s?",
+ "%s\\C*",
+ };
+ static vector<string> v(ops, ops + arraysize(ops));
+ return v;
+}
+
+RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
+ const vector<string>& atoms,
+ const vector<string>& ops)
+ : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
+ // Degenerate case.
+ if (atoms_.size() == 0)
+ maxatoms_ = 0;
+ if (ops_.size() == 0)
+ maxops_ = 0;
+}
+
+// Generates all possible regular expressions (within the parameters),
+// calling HandleRegexp for each one.
+void RegexpGenerator::Generate() {
+ vector<string> postfix;
+ GeneratePostfix(&postfix, 0, 0, 0);
+}
+
+// Generates random regular expressions, calling HandleRegexp for each one.
+void RegexpGenerator::GenerateRandom(int32 seed, int n) {
+ ACMRandom acm(seed);
+ acm_ = &acm;
+
+ for (int i = 0; i < n; i++) {
+ vector<string> postfix;
+ GenerateRandomPostfix(&postfix, 0, 0, 0);
+ }
+
+ acm_ = NULL;
+}
+
+// Counts and returns the number of occurrences of "%s" in s.
+static int CountArgs(const string& s) {
+ const char *p = s.c_str();
+ int n = 0;
+ while ((p = strstr(p, "%s")) != NULL) {
+ p += 2;
+ n++;
+ }
+ return n;
+}
+
+// Generates all possible postfix command sequences.
+// Each sequence is handed off to RunPostfix to generate a regular expression.
+// The arguments are:
+// post: the current postfix sequence
+// nstk: the number of elements that would be on the stack after executing
+// the sequence
+// ops: the number of operators used in the sequence
+// atoms: the number of atoms used in the sequence
+// For example, if post were ["a", "b", "%s%s", "c"],
+// then nstk = 2, ops = 1, atoms = 3.
+//
+// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
+//
+void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
+ int ops, int atoms) {
+ if (nstk == 1)
+ RunPostfix(*post);
+
+ // Early out: if used too many operators or can't
+ // get back down to a single expression on the stack
+ // using binary operators, give up.
+ if (ops + nstk - 1 > maxops_)
+ return;
+
+ // Add atoms if there is room.
+ if (atoms < maxatoms_) {
+ for (int i = 0; i < atoms_.size(); i++) {
+ post->push_back(atoms_[i]);
+ GeneratePostfix(post, nstk + 1, ops, atoms + 1);
+ post->pop_back();
+ }
+ }
+
+ // Add operators if there are enough arguments.
+ if (ops < maxops_) {
+ for (int i = 0; i < ops_.size(); i++) {
+ const string& fmt = ops_[i];
+ int nargs = CountArgs(fmt);
+ if (nargs <= nstk) {
+ post->push_back(fmt);
+ GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
+ post->pop_back();
+ }
+ }
+ }
+}
+
+// Generates a random postfix command sequence.
+// Stops and returns true once a single sequence has been generated.
+bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
+ int ops, int atoms) {
+ for (;;) {
+ // Stop if we get to a single element, but only sometimes.
+ if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
+ RunPostfix(*post);
+ return true;
+ }
+
+ // Early out: if used too many operators or can't
+ // get back down to a single expression on the stack
+ // using binary operators, give up.
+ if (ops + nstk - 1 > maxops_)
+ return false;
+
+ // Add operators if there are enough arguments.
+ if (ops < maxops_ && acm_->Uniform(2) == 0) {
+ const string& fmt = ops_[acm_->Uniform(ops_.size())];
+ int nargs = CountArgs(fmt);
+ if (nargs <= nstk) {
+ post->push_back(fmt);
+ bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
+ ops + 1, atoms);
+ post->pop_back();
+ if (ret)
+ return true;
+ }
+ }
+
+ // Add atoms if there is room.
+ if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
+ post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
+ bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
+ post->pop_back();
+ if (ret)
+ return true;
+ }
+ }
+}
+
+// Interprets the postfix command sequence to create a regular expression
+// passed to HandleRegexp. The results of operators like %s|%s are wrapped
+// in (?: ) to avoid needing to maintain a precedence table.
+void RegexpGenerator::RunPostfix(const vector<string>& post) {
+ stack<string> regexps;
+ for (int i = 0; i < post.size(); i++) {
+ switch (CountArgs(post[i])) {
+ default:
+ LOG(FATAL) << "Bad operator: " << post[i];
+ case 0:
+ regexps.push(post[i]);
+ break;
+ case 1: {
+ string a = regexps.top();
+ regexps.pop();
+ regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
+ break;
+ }
+ case 2: {
+ string b = regexps.top();
+ regexps.pop();
+ string a = regexps.top();
+ regexps.pop();
+ regexps.push("(?:" +
+ StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
+ ")");
+ break;
+ }
+ }
+ }
+
+ if (regexps.size() != 1) {
+ // Internal error - should never happen.
+ printf("Bad regexp program:\n");
+ for (int i = 0; i < post.size(); i++) {
+ printf(" %s\n", CEscape(post[i]).c_str());
+ }
+ printf("Stack after running program:\n");
+ while (!regexps.empty()) {
+ printf(" %s\n", CEscape(regexps.top()).c_str());
+ regexps.pop();
+ }
+ LOG(FATAL) << "Bad regexp program.";
+ }
+
+ HandleRegexp(regexps.top());
+ HandleRegexp("^(?:" + regexps.top() + ")$");
+ HandleRegexp("^(?:" + regexps.top() + ")");
+ HandleRegexp("(?:" + regexps.top() + ")$");
+}
+
+// Split s into an vector of strings, one for each UTF-8 character.
+vector<string> Explode(const StringPiece& s) {
+ vector<string> v;
+
+ for (const char *q = s.begin(); q < s.end(); ) {
+ const char* p = q;
+ Rune r;
+ q += chartorune(&r, q);
+ v.push_back(string(p, q - p));
+ }
+
+ return v;
+}
+
+// Split string everywhere a substring is found, returning
+// vector of pieces.
+vector<string> Split(const StringPiece& sep, const StringPiece& s) {
+ vector<string> v;
+
+ if (sep.size() == 0)
+ return Explode(s);
+
+ const char *p = s.begin();
+ for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
+ if (StringPiece(q, sep.size()) == sep) {
+ v.push_back(string(p, q - p));
+ p = q + sep.size();
+ q = p - 1; // -1 for ++ in loop
+ continue;
+ }
+ }
+ if (p < s.end())
+ v.push_back(string(p, s.end() - p));
+ return v;
+}
+
+} // namespace re2
diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h
new file mode 100644
index 0000000..b4506f2
--- /dev/null
+++ b/re2/testing/regexp_generator.h
@@ -0,0 +1,70 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression generator: generates all possible
+// regular expressions within given parameters (see below for details).
+
+#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
+#define RE2_TESTING_REGEXP_GENERATOR_H__
+
+#include <string>
+#include <vector>
+#include "util/random.h"
+#include "util/util.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Regular expression generator.
+//
+// Given a set of atom expressions like "a", "b", or "."
+// and operators like "%s*", generates all possible regular expressions
+// using at most maxbases base expressions and maxops operators.
+// For each such expression re, calls HandleRegexp(re).
+//
+// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
+//
+class RegexpGenerator {
+ public:
+ RegexpGenerator(int maxatoms, int maxops, const vector<string>& atoms,
+ const vector<string>& ops);
+ virtual ~RegexpGenerator() {}
+
+ // Generates all the regular expressions, calling HandleRegexp(re) for each.
+ void Generate();
+
+ // Generates n random regular expressions, calling HandleRegexp(re) for each.
+ void GenerateRandom(int32 seed, int n);
+
+ // Handles a regular expression. Must be provided by subclass.
+ virtual void HandleRegexp(const string& regexp) = 0;
+
+ // The egrep regexp operators: * + ? | and concatenation.
+ static const vector<string>& EgrepOps();
+
+ private:
+ void RunPostfix(const vector<string>& post);
+ void GeneratePostfix(vector<string>* post, int nstk, int ops, int lits);
+ bool GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int lits);
+
+ int maxatoms_; // Maximum number of atoms allowed in expr.
+ int maxops_; // Maximum number of ops allowed in expr.
+ vector<string> atoms_; // Possible atoms.
+ vector<string> ops_; // Possible ops.
+ ACMRandom* acm_; // Random generator.
+ DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator);
+};
+
+// Helpers for preparing arguments to RegexpGenerator constructor.
+
+// Returns one string for each character in s.
+vector<string> Explode(const StringPiece& s);
+
+// Splits string everywhere sep is found, returning
+// vector of pieces.
+vector<string> Split(const StringPiece& sep, const StringPiece& s);
+
+} // namespace re2
+
+#endif // RE2_TESTING_REGEXP_GENERATOR_H__
diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
new file mode 100644
index 0000000..1f0b216
--- /dev/null
+++ b/re2/testing/required_prefix_test.cc
@@ -0,0 +1,67 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct PrefixTest {
+ const char* regexp;
+ bool return_value;
+ const char* prefix;
+ bool foldcase;
+ const char* suffix;
+};
+
+static PrefixTest tests[] = {
+ // If the regexp is missing a ^, there's no required prefix.
+ { "abc", false },
+ { "", false },
+ { "(?m)^", false },
+
+ // If the regexp immediately goes into
+ // something not a literal match, there's no required prefix.
+ { "^(abc)", false },
+ { "^a*", false },
+
+ // Otherwise, it should work.
+ { "^abc$", true, "abc", false, "(?-m:$)" },
+ { "^abc", "true", "abc", false, "" },
+ { "^(?i)abc", true, "abc", true, "" },
+ { "^abcd*", true, "abc", false, "d*" },
+ { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
+ { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
+ { "^☺abc", true, "☺abc", false, "" },
+};
+
+TEST(RequiredPrefix, SimpleTests) {
+ for (int i = 0; i < arraysize(tests); i++) {
+ const PrefixTest& t = tests[i];
+ for (int j = 0; j < 2; j++) {
+ Regexp::ParseFlags flags = Regexp::LikePerl;
+ if (j == 0)
+ flags = flags | Regexp::Latin1;
+ Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+ CHECK(re) << " " << t.regexp;
+ string p;
+ bool f = false;
+ Regexp* s = NULL;
+ CHECK_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
+ << " " << t.regexp << " " << (j==0 ? "latin1" : "utf") << " " << re->Dump();
+ if (t.return_value) {
+ CHECK_EQ(p, string(t.prefix))
+ << " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
+ CHECK_EQ(f, t.foldcase)
+ << " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
+ CHECK_EQ(s->ToString(), string(t.suffix))
+ << " " << t.regexp << " " << (j==0 ? "latin1" : "utf");
+ s->Decref();
+ }
+ re->Decref();
+ }
+ }
+}
+
+} // namespace re2
diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
new file mode 100644
index 0000000..419d7ba
--- /dev/null
+++ b/re2/testing/search_test.cc
@@ -0,0 +1,279 @@
+// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdlib.h>
+#include <vector>
+#include "util/test.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/testing/tester.h"
+
+namespace re2 {
+
+struct RegexpTest {
+ const char* regexp;
+ const char* text;
+};
+
+RegexpTest simple_tests[] = {
+ { "a", "a" },
+ { "a", "zyzzyva" },
+ { "a+", "aa" },
+ { "(a+|b)+", "ab" },
+ { "ab|cd", "xabcdx" },
+ { "h.*od?", "hello\ngoodbye\n" },
+ { "h.*o", "hello\ngoodbye\n" },
+ { "h.*o", "goodbye\nhello\n" },
+ { "h.*o", "hello world" },
+ { "h.*o", "othello, world" },
+ { "[^\\s\\S]", "aaaaaaa" },
+ { "a", "aaaaaaa" },
+ { "a*", "aaaaaaa" },
+ { "a*", "" },
+ { "a*", NULL },
+ { "ab|cd", "xabcdx" },
+ { "a", "cab" },
+ { "a*b", "cab" },
+ { "((((((((((((((((((((x))))))))))))))))))))", "x" },
+ { "[abcd]", "xxxabcdxxx" },
+ { "[^x]", "xxxabcdxxx" },
+ { "[abcd]+", "xxxabcdxxx" },
+ { "[^x]+", "xxxabcdxxx" },
+ { "(fo|foo)", "fo" },
+ { "(foo|fo)", "foo" },
+
+ { "aa", "aA" },
+ { "a", "Aa" },
+ { "a", "A" },
+ { "ABC", "abc" },
+ { "abc", "XABCY" },
+ { "ABC", "xabcy" },
+
+ // Make sure ^ and $ work.
+ // The pathological cases didn't work
+ // in the original grep code.
+ { "foo|bar|[A-Z]", "foo" },
+ { "^(foo|bar|[A-Z])", "foo" },
+ { "(foo|bar|[A-Z])$", "foo\n" },
+ { "(foo|bar|[A-Z])$", "foo" },
+ { "^(foo|bar|[A-Z])$", "foo\n" },
+ { "^(foo|bar|[A-Z])$", "foo" },
+ { "^(foo|bar|[A-Z])$", "bar" },
+ { "^(foo|bar|[A-Z])$", "X" },
+ { "^(foo|bar|[A-Z])$", "XY" },
+ { "^(fo|foo)$", "fo" },
+ { "^(fo|foo)$", "foo" },
+ { "^^(fo|foo)$", "fo" },
+ { "^^(fo|foo)$", "foo" },
+ { "^$", "" },
+ { "^$", "x" },
+ { "^^$", "" },
+ { "^$$", "" },
+ { "^^$", "x" },
+ { "^$$", "x" },
+ { "^^$$", "" },
+ { "^^$$", "x" },
+ { "^^^^^^^^$$$$$$$$", "" },
+ { "^", "x" },
+ { "$", "x" },
+
+ // Word boundaries.
+ { "\\bfoo\\b", "nofoo foo that" },
+ { "a\\b", "faoa x" },
+ { "\\bbar", "bar x" },
+ { "\\bbar", "foo\nbar x" },
+ { "bar\\b", "foobar" },
+ { "bar\\b", "foobar\nxxx" },
+ { "(foo|bar|[A-Z])\\b", "foo" },
+ { "(foo|bar|[A-Z])\\b", "foo\n" },
+ { "\\b", "" },
+ { "\\b", "x" },
+ { "\\b(foo|bar|[A-Z])", "foo" },
+ { "\\b(foo|bar|[A-Z])\\b", "X" },
+ { "\\b(foo|bar|[A-Z])\\b", "XY" },
+ { "\\b(foo|bar|[A-Z])\\b", "bar" },
+ { "\\b(foo|bar|[A-Z])\\b", "foo" },
+ { "\\b(foo|bar|[A-Z])\\b", "foo\n" },
+ { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
+ { "\\b(fo|foo)\\b", "fo" },
+ { "\\b(fo|foo)\\b", "foo" },
+ { "\\b\\b", "" },
+ { "\\b\\b", "x" },
+ { "\\b$", "" },
+ { "\\b$", "x" },
+ { "\\b$", "y x" },
+ { "\\b.$", "x" },
+ { "^\\b(fo|foo)\\b", "fo" },
+ { "^\\b(fo|foo)\\b", "foo" },
+ { "^\\b", "" },
+ { "^\\b", "x" },
+ { "^\\b\\b", "" },
+ { "^\\b\\b", "x" },
+ { "^\\b$", "" },
+ { "^\\b$", "x" },
+ { "^\\b.$", "x" },
+ { "^\\b.\\b$", "x" },
+ { "^^^^^^^^\\b$$$$$$$", "" },
+ { "^^^^^^^^\\b.$$$$$$", "x" },
+ { "^^^^^^^^\\b$$$$$$$", "x" },
+
+ // Non-word boundaries.
+ { "\\Bfoo\\B", "n foo xfoox that" },
+ { "a\\B", "faoa x" },
+ { "\\Bbar", "bar x" },
+ { "\\Bbar", "foo\nbar x" },
+ { "bar\\B", "foobar" },
+ { "bar\\B", "foobar\nxxx" },
+ { "(foo|bar|[A-Z])\\B", "foox" },
+ { "(foo|bar|[A-Z])\\B", "foo\n" },
+ { "\\B", "" },
+ { "\\B", "x" },
+ { "\\B(foo|bar|[A-Z])", "foo" },
+ { "\\B(foo|bar|[A-Z])\\B", "xXy" },
+ { "\\B(foo|bar|[A-Z])\\B", "XY" },
+ { "\\B(foo|bar|[A-Z])\\B", "XYZ" },
+ { "\\B(foo|bar|[A-Z])\\B", "abara" },
+ { "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
+ { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
+ { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
+ { "\\B(fo|foo)\\B", "xfoo" },
+ { "\\B(foo|fo)\\B", "xfooo" },
+ { "\\B\\B", "" },
+ { "\\B\\B", "x" },
+ { "\\B$", "" },
+ { "\\B$", "x" },
+ { "\\B$", "y x" },
+ { "\\B.$", "x" },
+ { "^\\B(fo|foo)\\B", "fo" },
+ { "^\\B(fo|foo)\\B", "foo" },
+ { "^\\B", "" },
+ { "^\\B", "x" },
+ { "^\\B\\B", "" },
+ { "^\\B\\B", "x" },
+ { "^\\B$", "" },
+ { "^\\B$", "x" },
+ { "^\\B.$", "x" },
+ { "^\\B.\\B$", "x" },
+ { "^^^^^^^^\\B$$$$$$$", "" },
+ { "^^^^^^^^\\B.$$$$$$", "x" },
+ { "^^^^^^^^\\B$$$$$$$", "x" },
+
+ // PCRE uses only ASCII for \b computation.
+ // All non-ASCII are *not* word characters.
+ { "\\bx\\b", "x" },
+ { "\\bx\\b", "x>" },
+ { "\\bx\\b", "<x" },
+ { "\\bx\\b", "<x>" },
+ { "\\bx\\b", "ax" },
+ { "\\bx\\b", "xb" },
+ { "\\bx\\b", "axb" },
+ { "\\bx\\b", "«x" },
+ { "\\bx\\b", "x»" },
+ { "\\bx\\b", "«x»" },
+ { "\\bx\\b", "axb" },
+ { "\\bx\\b", "áxβ" },
+ { "\\Bx\\B", "axb" },
+ { "\\Bx\\B", "áxβ" },
+
+ // Weird boundary cases.
+ { "^$^$", "" },
+ { "^$^", "" },
+ { "$^$", "" },
+
+ { "^$^$", "x" },
+ { "^$^", "x" },
+ { "$^$", "x" },
+
+ { "^$^$", "x\ny" },
+ { "^$^", "x\ny" },
+ { "$^$", "x\ny" },
+
+ { "^$^$", "x\n\ny" },
+ { "^$^", "x\n\ny" },
+ { "$^$", "x\n\ny" },
+
+ { "^(foo\\$)$", "foo$bar" },
+ { "(foo\\$)", "foo$bar" },
+ { "^...$", "abc" },
+
+ // UTF-8
+ { "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
+ { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^...$", ".\xe6\x9c\xac." },
+
+ { "^\\C\\C\\C$", "\xe6\x9c\xac" },
+ { "^\\C$", "\xe6\x9c\xac" },
+ { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+
+ // Latin1
+ { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
+ { "^...$", ".\xe6\x9c\xac." },
+ { "^.....$", ".\xe6\x9c\xac." },
+
+ // Perl v Posix
+ { "\\B(fo|foo)\\B", "xfooo" },
+ { "(fo|foo)", "foo" },
+
+ // Octal escapes.
+ { "\\141", "a" },
+ { "\\060", "0" },
+ { "\\0600", "00" },
+ { "\\608", "08" },
+ { "\\01", "\01" },
+ { "\\018", "\01" "8" },
+
+ // Hexadecimal escapes
+ { "\\x{61}", "a" },
+ { "\\x61", "a" },
+ { "\\x{00000061}", "a" },
+
+ // Unicode scripts.
+ { "\\p{Greek}+", "aαβb" },
+ { "\\P{Greek}+", "aαβb" },
+ { "\\p{^Greek}+", "aαβb" },
+ { "\\P{^Greek}+", "aαβb" },
+
+ // Unicode properties. Nd is decimal number. N is any number.
+ { "[^0-9]+", "abc123" },
+ { "\\p{Nd}+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\p{^Nd}+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\P{Nd}+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\P{^Nd}+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\pN+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\p{N}+", "abc123²³¼½¾â‚€â‚‰" },
+ { "\\p{^N}+", "abc123²³¼½¾â‚€â‚‰" },
+
+ { "\\p{Any}+", "abc123" },
+
+ // Character classes & case folding.
+ { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B
+ { "(?i)[A-Z]+", "aAzZ" },
+ { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z -
+ // splits the ranges in an interesting way.
+
+ // would like to use, but PCRE mishandles in full-match, non-greedy mode
+ // { "(?i)[\\\\]+", "Aa" },
+
+ { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+
+ // Character classes & case folding.
+ { "[@-A]+", "@AaB" },
+ { "[A-Z]+", "aAzZ" },
+ { "[^\\\\]+", "Aa\\" },
+ { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+
+};
+
+TEST(Regexp, SearchTests) {
+ int failures = 0;
+ for (int i = 0; i < arraysize(simple_tests); i++) {
+ const RegexpTest& t = simple_tests[i];
+ if (!TestRegexpOnText(t.regexp, t.text))
+ failures++;
+ }
+ EXPECT_EQ(failures, 0);
+}
+
+} // namespace re2
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
new file mode 100644
index 0000000..6f6cfc5
--- /dev/null
+++ b/re2/testing/simplify_test.cc
@@ -0,0 +1,166 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test simplify.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct Test {
+ const char* regexp;
+ const char* simplified;
+};
+
+static Test tests[] = {
+ // Already-simple constructs
+ { "a", "a" },
+ { "ab", "ab" },
+ { "a|b", "[a-b]" },
+ { "ab|cd", "ab|cd" },
+ { "(ab)*", "(ab)*" },
+ { "(ab)+", "(ab)+" },
+ { "(ab)?", "(ab)?" },
+ { ".", "." },
+ { "^", "^" },
+ { "$", "$" },
+ { "[ac]", "[ac]" },
+ { "[^ac]", "[^ac]" },
+
+ // Posix character classes
+ { "[[:alnum:]]", "[0-9A-Za-z]" },
+ { "[[:alpha:]]", "[A-Za-z]" },
+ { "[[:blank:]]", "[\\t ]" },
+ { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
+ { "[[:digit:]]", "[0-9]" },
+ { "[[:graph:]]", "[!-~]" },
+ { "[[:lower:]]", "[a-z]" },
+ { "[[:print:]]", "[ -~]" },
+ { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
+ { "[[:space:]]" , "[\\t-\\r ]" },
+ { "[[:upper:]]", "[A-Z]" },
+ { "[[:xdigit:]]", "[0-9A-Fa-f]" },
+
+ // Perl character classes
+ { "\\d", "[0-9]" },
+ { "\\s", "[\\t-\\n\\f-\\r ]" },
+ { "\\w", "[0-9A-Z_a-z]" },
+ { "\\D", "[^0-9]" },
+ { "\\S", "[^\\t-\\n\\f-\\r ]" },
+ { "\\W", "[^0-9A-Z_a-z]" },
+ { "[\\d]", "[0-9]" },
+ { "[\\s]", "[\\t-\\n\\f-\\r ]" },
+ { "[\\w]", "[0-9A-Z_a-z]" },
+ { "[\\D]", "[^0-9]" },
+ { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
+ { "[\\W]", "[^0-9A-Z_a-z]" },
+
+ // Posix repetitions
+ { "a{1}", "a" },
+ { "a{2}", "aa" },
+ { "a{5}", "aaaaa" },
+ { "a{0,1}", "a?" },
+ // The next three are illegible because Simplify inserts (?:)
+ // parens instead of () parens to avoid creating extra
+ // captured subexpressions. The comments show a version fewer parens.
+ { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
+ { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
+ { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
+ { "a{0,2}", "(?:aa?)?" }, // (aa?)?
+ { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
+ { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
+ { "a{0,}", "a*" },
+ { "a{1,}", "a+" },
+ { "a{2,}", "aa+" },
+ { "a{5,}", "aaaaa+" },
+
+ // Test that operators simplify their arguments.
+ // (Simplify used to not simplify arguments to a {} repeat.)
+ { "(?:a{1,}){1,}", "a+" },
+ { "(a{1,}b{1,})", "(a+b+)" },
+ { "a{1,}|b{1,}", "a+|b+" },
+ { "(?:a{1,})*", "(?:a+)*" },
+ { "(?:a{1,})+", "a+" },
+ { "(?:a{1,})?", "(?:a+)?" },
+ { "a{0}", "" },
+
+ // Character class simplification
+ { "[ab]", "[a-b]" },
+ { "[a-za-za-z]", "[a-z]" },
+ { "[A-Za-zA-Za-z]", "[A-Za-z]" },
+ { "[ABCDEFGH]", "[A-H]" },
+ { "[AB-CD-EF-GH]", "[A-H]" },
+ { "[W-ZP-XE-R]", "[E-Z]" },
+ { "[a-ee-gg-m]", "[a-m]" },
+ { "[a-ea-ha-m]", "[a-m]" },
+ { "[a-ma-ha-e]", "[a-m]" },
+ { "[a-zA-Z0-9 -~]", "[ -~]" },
+
+ // Empty character classes
+ { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
+
+ // Full character classes
+ { "[[:cntrl:][:^cntrl:]]", "." },
+
+ // Unicode case folding.
+ { "(?i)A", "[Aa]" },
+ { "(?i)a", "[Aa]" },
+ { "(?i)K", "[Kk\\x{212a}]" },
+ { "(?i)k", "[Kk\\x{212a}]" },
+ { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
+ { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
+ { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
+ { "(?i)[\\x00-\\x{10ffff}]", "." },
+
+ // Empty string as a regular expression.
+ // Empty string must be preserved inside parens in order
+ // to make submatches work right, so these are less
+ // interesting than they used to be. ToString inserts
+ // explicit (?:) in place of non-parenthesized empty strings,
+ // to make them easier to spot for other parsers.
+ { "(a|b|)", "([a-b]|(?:))" },
+ { "(|)", "((?:)|(?:))" },
+ { "a()", "a()" },
+ { "(()|())", "(()|())" },
+ { "(a|)", "(a|(?:))" },
+ { "ab()cd()", "ab()cd()" },
+ { "()", "()" },
+ { "()*", "()*" },
+ { "()+", "()+" },
+ { "()?" , "()?" },
+ { "(){0}", "" },
+ { "(){1}", "()" },
+ { "(){1,}", "()+" },
+ { "(){0,2}", "(?:()()?)?" },
+};
+
+TEST(TestSimplify, SimpleRegexps) {
+ for (int i = 0; i < arraysize(tests); i++) {
+ RegexpStatus status;
+ VLOG(1) << "Testing " << tests[i].regexp;
+ Regexp* re = Regexp::Parse(tests[i].regexp,
+ Regexp::MatchNL | (Regexp::LikePerl &
+ ~Regexp::OneLine),
+ &status);
+ CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
+ Regexp* sre = re->Simplify();
+ CHECK(sre != NULL);
+
+ // Check that already-simple regexps don't allocate new ones.
+ if (strcmp(tests[i].regexp, tests[i].simplified) == 0)
+ CHECK(re == sre) << " " << tests[i].regexp
+ << " " << re->ToString() << " " << sre->ToString();
+
+ EXPECT_EQ(tests[i].simplified, sre->ToString())
+ << " " << tests[i].regexp << " " << sre->Dump();
+
+ re->Decref();
+ sre->Decref();
+ }
+}
+
+} // namespace re2
diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc
new file mode 100644
index 0000000..4fdcb38
--- /dev/null
+++ b/re2/testing/string_generator.cc
@@ -0,0 +1,114 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// String generator: generates all possible strings of up to
+// maxlen letters using the set of letters in alpha.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+StringGenerator::StringGenerator(int maxlen, const vector<string>& alphabet)
+ : maxlen_(maxlen), alphabet_(alphabet),
+ generate_null_(false),
+ random_(false), nrandom_(0), acm_(NULL) {
+
+ // Degenerate case: no letters, no non-empty strings.
+ if (alphabet_.size() == 0)
+ maxlen_ = 0;
+
+ // Next() will return empty string (digits_ is empty).
+ hasnext_ = true;
+}
+
+StringGenerator::~StringGenerator() {
+ if (acm_)
+ delete acm_;
+}
+
+// Resets the string generator state to the beginning.
+void StringGenerator::Reset() {
+ digits_.clear();
+ hasnext_ = true;
+ random_ = false;
+ nrandom_ = 0;
+ generate_null_ = false;
+}
+
+// Increments the big number in digits_, returning true if successful.
+// Returns false if all the numbers have been used.
+bool StringGenerator::IncrementDigits() {
+ // First try to increment the current number.
+ for (int i = digits_.size() - 1; i >= 0; i--) {
+ if (++digits_[i] < alphabet_.size())
+ return true;
+ digits_[i] = 0;
+ }
+
+ // If that failed, make a longer number.
+ if (digits_.size() < maxlen_) {
+ digits_.push_back(0);
+ return true;
+ }
+
+ return false;
+}
+
+// Generates random digits_, return true if successful.
+// Returns false if the random sequence is over.
+bool StringGenerator::RandomDigits() {
+ if (--nrandom_ <= 0)
+ return false;
+
+ // Pick length.
+ int len = acm_->Uniform(maxlen_+1);
+ digits_.resize(len);
+ for (int i = 0; i < len; i++)
+ digits_[i] = acm_->Uniform(alphabet_.size());
+ return true;
+}
+
+// Returns the next string in the iteration, which is the one
+// currently described by digits_. Calls IncrementDigits
+// after computing the string, so that it knows the answer
+// for subsequent HasNext() calls.
+const StringPiece& StringGenerator::Next() {
+ CHECK(hasnext_);
+ if (generate_null_) {
+ generate_null_ = false;
+ sp_ = NULL;
+ return sp_;
+ }
+ s_.clear();
+ for (int i = 0; i < digits_.size(); i++) {
+ s_ += alphabet_[digits_[i]];
+ }
+ hasnext_ = random_ ? RandomDigits() : IncrementDigits();
+ sp_ = s_;
+ return sp_;
+}
+
+// Sets generator up to return n random strings.
+void StringGenerator::Random(int32 seed, int n) {
+ if (acm_ == NULL)
+ acm_ = new ACMRandom(seed);
+ else
+ acm_->Reset(seed);
+
+ random_ = true;
+ nrandom_ = n;
+ hasnext_ = nrandom_ > 0;
+}
+
+void StringGenerator::GenerateNULL() {
+ generate_null_ = true;
+ hasnext_ = true;
+}
+
+} // namespace re2
+
diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h
new file mode 100644
index 0000000..6a9ef42
--- /dev/null
+++ b/re2/testing/string_generator.h
@@ -0,0 +1,58 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// String generator: generates all possible strings of up to
+// maxlen letters using the set of letters in alpha.
+// Fetch strings using a Java-like Next()/HasNext() interface.
+
+#ifndef RE2_TESTING_STRING_GENERATOR_H__
+#define RE2_TESTING_STRING_GENERATOR_H__
+
+#include <string>
+#include <vector>
+#include "util/util.h"
+#include "util/random.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+class StringGenerator {
+ public:
+ StringGenerator(int maxlen, const vector<string>& alphabet);
+ ~StringGenerator();
+ const StringPiece& Next();
+ bool HasNext() { return hasnext_; }
+
+ // Resets generator to start sequence over.
+ void Reset();
+
+ // Causes generator to emit random strings for next n calls to Next().
+ void Random(int32 seed, int n);
+
+ // Causes generator to emit a NULL as the next call.
+ void GenerateNULL();
+
+ private:
+ bool IncrementDigits();
+ bool RandomDigits();
+
+ // Global state.
+ int maxlen_; // Maximum length string to generate.
+ vector<string> alphabet_; // Alphabet, one string per letter.
+
+ // Iteration state.
+ StringPiece sp_; // Last StringPiece returned by Next().
+ string s_; // String data in last StringPiece returned by Next().
+ bool hasnext_; // Whether Next() can be called again.
+ vector<int> digits_; // Alphabet indices for next string.
+ bool generate_null_; // Whether to generate a NULL StringPiece next.
+ bool random_; // Whether generated strings are random.
+ int nrandom_; // Number of random strings left to generate.
+ ACMRandom* acm_; // Random number generator
+ DISALLOW_EVIL_CONSTRUCTORS(StringGenerator);
+};
+
+} // namespace re2
+
+#endif // RE2_TESTING_STRING_GENERATOR_H__
diff --git a/re2/testing/string_generator_test.cc b/re2/testing/string_generator_test.cc
new file mode 100644
index 0000000..d13401a
--- /dev/null
+++ b/re2/testing/string_generator_test.cc
@@ -0,0 +1,109 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test StringGenerator.
+
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/testing/string_generator.h"
+#include "re2/testing/regexp_generator.h"
+
+namespace re2 {
+
+// Returns i to the e.
+static int64 IntegerPower(int i, int e) {
+ int64 p = 1;
+ while (e-- > 0)
+ p *= i;
+ return p;
+}
+
+// Checks that for given settings of the string generator:
+// * it generates strings that are non-decreasing in length.
+// * strings of the same length are sorted in alphabet order.
+// * it doesn't generate the same string twice.
+// * it generates the right number of strings.
+//
+// If all of these hold, the StringGenerator is behaving.
+// Assumes that the alphabet is sorted, so that the generated
+// strings can just be compared lexicographically.
+static void RunTest(int len, string alphabet, bool donull) {
+ StringGenerator g(len, Explode(alphabet));
+
+ int n = 0;
+ int last_l = -1;
+ string last_s;
+
+ if (donull) {
+ g.GenerateNULL();
+ EXPECT_TRUE(g.HasNext());
+ StringPiece sp = g.Next();
+ EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
+ EXPECT_EQ(sp.size(), 0);
+ }
+
+ while (g.HasNext()) {
+ string s = g.Next().as_string();
+ n++;
+
+ // Check that all characters in s appear in alphabet.
+ for (const char *p = s.c_str(); *p != '\0'; ) {
+ Rune r;
+ p += chartorune(&r, p);
+ EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
+ }
+
+ // Check that string is properly ordered w.r.t. previous string.
+ int l = utflen(s.c_str());
+ EXPECT_LE(l, len);
+ if (last_l < l) {
+ last_l = l;
+ } else {
+ EXPECT_EQ(last_l, l);
+ EXPECT_LT(last_s, s);
+ }
+ last_s = s;
+ }
+
+ // Check total string count.
+ int64 m = 0;
+ int alpha = utflen(alphabet.c_str());
+ if (alpha == 0) // Degenerate case.
+ len = 0;
+ for (int i = 0; i <= len; i++)
+ m += IntegerPower(alpha, i);
+ EXPECT_EQ(n, m);
+}
+
+TEST(StringGenerator, NoLength) {
+ RunTest(0, "abc", false);
+}
+
+TEST(StringGenerator, NoLengthNoAlphabet) {
+ RunTest(0, "", false);
+}
+
+TEST(StringGenerator, NoAlphabet) {
+ RunTest(5, "", false);
+}
+
+TEST(StringGenerator, Simple) {
+ RunTest(3, "abc", false);
+}
+
+TEST(StringGenerator, UTF8) {
+ RunTest(4, "abc\xE2\x98\xBA", false);
+}
+
+TEST(StringGenerator, GenNULL) {
+ RunTest(0, "abc", true);
+ RunTest(0, "", true);
+ RunTest(5, "", true);
+ RunTest(3, "abc", true);
+ RunTest(4, "abc\xE2\x98\xBA", true);
+}
+
+} // namespace re2
diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc
new file mode 100644
index 0000000..6987be9
--- /dev/null
+++ b/re2/testing/tester.cc
@@ -0,0 +1,636 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression engine tester -- test all the implementations against each other.
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "re2/testing/tester.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+DEFINE_bool(dump_prog, false, "dump regexp program");
+DEFINE_bool(log_okay, false, "log successful runs");
+DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
+
+DEFINE_int32(max_regexp_failures, 1,
+ "maximum number of regexp test failures (-1 = unlimited)");
+
+DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
+
+namespace re2 {
+
+enum {
+ kMaxSubmatch = 1+16, // $0...$16
+};
+
+const char* engine_types[kEngineMax] = {
+ "Backtrack",
+ "NFA",
+ "DFA",
+ "DFA1",
+ "OnePass",
+ "BitState",
+ "RE2",
+ "RE2a",
+ "RE2b",
+ "PCRE",
+};
+
+// Returns the name string for the type t.
+static string EngineString(Engine t) {
+ if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
+ return StringPrintf("type%d", static_cast<int>(t));
+ }
+ return engine_types[t];
+}
+
+// Returns bit mask of engines to use.
+static uint32 Engines() {
+ static uint32 cached_engines;
+ static bool did_parse;
+
+ if (did_parse)
+ return cached_engines;
+
+ if (FLAGS_regexp_engines.empty()) {
+ cached_engines = ~0;
+ } else {
+ for (int i = 0; i < kEngineMax; i++)
+ if(strstr(EngineString(static_cast<Engine>(i)).c_str(),
+ FLAGS_regexp_engines.c_str()))
+ cached_engines |= 1<<i;
+ }
+
+ if (cached_engines == 0)
+ LOG(INFO) << "Warning: no engines enabled.";
+ if (!UsingPCRE)
+ cached_engines &= ~(1<<kEnginePCRE);
+ for (int i = 0; i < kEngineMax; i++) {
+ if (cached_engines & (1<<i))
+ LOG(INFO) << EngineString(static_cast<Engine>(i))
+ << " enabled";
+ }
+ did_parse = true;
+ return cached_engines;
+}
+
+// The result of running a match.
+struct TestInstance::Result {
+ bool skipped; // test skipped: wasn't applicable
+ bool matched; // found a match
+ bool untrusted; // don't really trust the answer
+ bool have_submatch; // computed all submatch info
+ bool have_submatch0; // computed just submatch[0]
+ StringPiece submatch[kMaxSubmatch];
+};
+
+typedef TestInstance::Result Result;
+
+// Formats a single capture range s in text in the form (a,b)
+// where a and b are the starting and ending offsets of s in text.
+static string FormatCapture(const StringPiece& text, const StringPiece& s) {
+ if (s.begin() == NULL)
+ return "(?,?)";
+ return StringPrintf("(%d,%d)",
+ static_cast<int>(s.begin() - text.begin()),
+ static_cast<int>(s.end() - text.begin()));
+}
+
+// Returns whether text contains non-ASCII (>= 0x80) bytes.
+static bool NonASCII(const StringPiece& text) {
+ for (int i = 0; i < text.size(); i++)
+ if ((uint8)text[i] >= 0x80)
+ return true;
+ return false;
+}
+
+// Returns string representation of match kind.
+static string FormatKind(Prog::MatchKind kind) {
+ switch (kind) {
+ case Prog::kFullMatch:
+ return "full match";
+ case Prog::kLongestMatch:
+ return "longest match";
+ case Prog::kFirstMatch:
+ return "first match";
+ }
+ return "???";
+}
+
+// Returns string representation of anchor kind.
+static string FormatAnchor(Prog::Anchor anchor) {
+ switch (anchor) {
+ case Prog::kAnchored:
+ return "anchored";
+ case Prog::kUnanchored:
+ return "unanchored";
+ }
+ return "???";
+}
+
+struct ParseMode {
+ Regexp::ParseFlags parse_flags;
+ string desc;
+};
+
+static const Regexp::ParseFlags single_line =
+ Regexp::LikePerl;
+static const Regexp::ParseFlags multi_line =
+ static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
+
+static ParseMode parse_modes[] = {
+ { single_line, "single-line" },
+ { single_line|Regexp::Latin1, "single-line, latin1" },
+ { multi_line, "multiline" },
+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
+ { multi_line|Regexp::Latin1, "multiline, latin1" },
+};
+
+static string FormatMode(Regexp::ParseFlags flags) {
+ for (int i = 0; i < arraysize(parse_modes); i++)
+ if (parse_modes[i].parse_flags == flags)
+ return parse_modes[i].desc;
+ return StringPrintf("%#x", static_cast<uint>(flags));
+}
+
+// Constructs and saves all the matching engines that
+// will be required for the given tests.
+TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
+ Regexp::ParseFlags flags)
+ : regexp_str_(regexp_str),
+ kind_(kind),
+ flags_(flags),
+ error_(false),
+ regexp_(NULL),
+ prog_(NULL),
+ rprog_(NULL),
+ re_(NULL),
+ re2_(NULL) {
+
+ VLOG(1) << CEscape(regexp_str);
+
+ // Compile regexp to prog.
+ // Always required - needed for backtracking (reference implementation).
+ RegexpStatus status;
+ regexp_ = Regexp::Parse(regexp_str, flags, &status);
+ if (regexp_ == NULL) {
+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+ << " mode: " << FormatMode(flags);
+ error_ = true;
+ return;
+ }
+ prog_ = regexp_->CompileToProg(0);
+ if (prog_ == NULL) {
+ LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (FLAGS_dump_prog) {
+ LOG(INFO) << "Prog for "
+ << " regexp "
+ << CEscape(regexp_str_)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatMode(flags_)
+ << ")\n"
+ << prog_->Dump();
+ }
+
+ // Compile regexp to reversed prog. Only needed for DFA engines.
+ if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
+ rprog_ = regexp_->CompileToReverseProg(0);
+ if (rprog_ == NULL) {
+ LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (FLAGS_dump_rprog)
+ LOG(INFO) << rprog_->Dump();
+ }
+
+ // Create re string that will be used for RE and RE2.
+ string re = regexp_str.as_string();
+ // Accomodate flags.
+ // Regexp::Latin1 will be accomodated below.
+ if (!(flags & Regexp::OneLine))
+ re = "(?m)" + re;
+ if (flags & Regexp::NonGreedy)
+ re = "(?U)" + re;
+ if (flags & Regexp::DotNL)
+ re = "(?s)" + re;
+
+ // Compile regexp to RE2.
+ if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
+ RE2::Options options;
+ if (flags & Regexp::Latin1)
+ options.set_encoding(RE2::Options::EncodingLatin1);
+ if (kind_ == Prog::kLongestMatch)
+ options.set_longest_match(true);
+ re2_ = new RE2(re, options);
+ if (!re2_->error().empty()) {
+ LOG(INFO) << "Cannot RE2: " << CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+
+ // Compile regexp to RE.
+ // PCRE as exposed by the RE interface isn't always usable.
+ // 1. It disagrees about handling of empty-string reptitions
+ // like matching (a*)* against "b". PCRE treats the (a*) as
+ // occurring once, while we treat it as occurring not at all.
+ // 2. It treats $ as this weird thing meaning end of string
+ // or before the \n at the end of the string.
+ // 3. It doesn't implement POSIX leftmost-longest matching.
+ // MimicsPCRE() detects 1 and 2.
+ if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
+ kind_ != Prog::kLongestMatch) {
+ PCRE_Options o;
+ o.set_option(PCRE::UTF8);
+ if (flags & Regexp::Latin1)
+ o.set_option(PCRE::None);
+ // PCRE has interface bug keeping us from finding $0, so
+ // add one more layer of parens.
+ re_ = new PCRE("("+re+")", o);
+ if (!re_->error().empty()) {
+ LOG(INFO) << "Cannot PCRE: " << CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+}
+
+TestInstance::~TestInstance() {
+ if (regexp_)
+ regexp_->Decref();
+ if (prog_)
+ delete prog_;
+ if (rprog_)
+ delete rprog_;
+ if (re_)
+ delete re_;
+ if (re2_)
+ delete re2_;
+}
+
+// Runs a single search using the named engine type.
+// This interface hides all the irregularities of the various
+// engine interfaces from the rest of this file.
+void TestInstance::RunSearch(Engine type,
+ const StringPiece& orig_text,
+ const StringPiece& orig_context,
+ Prog::Anchor anchor,
+ Result *result) {
+ memset(result, 0, sizeof *result);
+ if (regexp_ == NULL) {
+ result->skipped = true;
+ return;
+ }
+ int nsubmatch = 1 + regexp_->NumCaptures(); // NumCaptures doesn't count $0
+ if (nsubmatch > kMaxSubmatch)
+ nsubmatch = kMaxSubmatch;
+
+ StringPiece text = orig_text;
+ StringPiece context = orig_context;
+
+ switch (type) {
+ default:
+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+
+ case kEngineBacktrack:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineNFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchNFA(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineDFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
+ &result->skipped);
+ break;
+
+ case kEngineDFA1:
+ if (prog_ == NULL || rprog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
+ &result->skipped);
+ // If anchored, no need for second run,
+ // but do it anyway to find more bugs.
+ if (result->matched) {
+ if (!rprog_->SearchDFA(result->submatch[0], context,
+ Prog::kAnchored, Prog::kLongestMatch,
+ result->submatch,
+ &result->skipped)) {
+ LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
+ << " on " << CEscape(text);
+ result->matched = false;
+ }
+ }
+ result->have_submatch0 = true;
+ break;
+
+ case kEngineOnePass:
+ if (prog_ == NULL ||
+ anchor == Prog::kUnanchored ||
+ !prog_->IsOnePass() ||
+ nsubmatch > Prog::kMaxOnePassCapture) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineBitState:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchBitState(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineRE2:
+ case kEngineRE2a:
+ case kEngineRE2b: {
+ if (!re2_ || text.end() != context.end()) {
+ result->skipped = true;
+ break;
+ }
+
+ RE2::Anchor re_anchor;
+ if (anchor == Prog::kAnchored)
+ re_anchor = RE2::ANCHOR_START;
+ else
+ re_anchor = RE2::UNANCHORED;
+ if (kind_ == Prog::kFullMatch)
+ re_anchor = RE2::ANCHOR_BOTH;
+
+ result->matched = re2_->Match(context, text.begin() - context.begin(),
+ re_anchor, result->submatch, nsubmatch);
+ result->have_submatch = nsubmatch > 0;
+ break;
+ }
+
+ case kEnginePCRE: {
+ if (!re_ || text.begin() != context.begin() ||
+ text.end() != context.end()) {
+ result->skipped = true;
+ break;
+ }
+
+ const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
+ PCRE::Arg *a = new PCRE::Arg[nsubmatch];
+ for (int i = 0; i < nsubmatch; i++) {
+ a[i] = PCRE::Arg(&result->submatch[i]);
+ argptr[i] = &a[i];
+ }
+ int consumed;
+ PCRE::Anchor pcre_anchor;
+ if (anchor == Prog::kAnchored)
+ pcre_anchor = PCRE::ANCHOR_START;
+ else
+ pcre_anchor = PCRE::UNANCHORED;
+ if (kind_ == Prog::kFullMatch)
+ pcre_anchor = PCRE::ANCHOR_BOTH;
+ re_->ClearHitLimit();
+ result->matched =
+ re_->DoMatch(text,
+ pcre_anchor,
+ &consumed,
+ argptr, nsubmatch);
+ if (re_->HitLimit()) {
+ result->untrusted = true;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ result->have_submatch = true;
+
+ // Work around RE interface bug: PCRE returns -1 as the
+ // offsets for an unmatched subexpression, and RE should
+ // turn that into StringPiece(NULL) but in fact it uses
+ // StringPiece(text.begin() - 1, 0). Oops.
+ for (int i = 0; i < nsubmatch; i++)
+ if (result->submatch[i].begin() == text.begin() - 1)
+ result->submatch[i] = NULL;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ }
+
+ if (!result->matched)
+ memset(result->submatch, 0, sizeof result->submatch);
+}
+
+// Checks whether r is okay given that correct is the right answer.
+// Specifically, r's answers have to match (but it doesn't have to
+// claim to have all the answers).
+static bool ResultOkay(const Result& r, const Result& correct) {
+ if (r.skipped)
+ return true;
+ if (r.matched != correct.matched)
+ return false;
+ if (r.have_submatch || r.have_submatch0) {
+ for (int i = 0; i < kMaxSubmatch; i++) {
+ if (correct.submatch[i].begin() != r.submatch[i].begin() ||
+ correct.submatch[i].size() != r.submatch[i].size())
+ return false;
+ if (!r.have_submatch)
+ break;
+ }
+ }
+ return true;
+}
+
+// Runs a single test.
+bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+ // Backtracking is the gold standard.
+ Result correct;
+ RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+ if (correct.skipped) {
+ if (regexp_ == NULL)
+ return true;
+ LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+ << " " << FormatMode(flags_);
+ return false;
+ }
+ VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
+ << " text " << CEscape(text)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatAnchor(anchor)
+ << ", " << FormatMode(flags_)
+ << ")";
+
+ // Compare the others.
+ bool all_okay = true;
+ for (int i = kEngineBacktrack+1; i < kEngineMax; i++) {
+ if (!(Engines() & (1<<i)))
+ continue;
+
+ Result r;
+ RunSearch(static_cast<Engine>(i), text, context, anchor, &r);
+ if (ResultOkay(r, correct)) {
+ if (FLAGS_log_okay) {
+ LOG(INFO) << (r.skipped ? "Skipped: " : "Okay: ")
+ << EngineString(static_cast<Engine>(i))
+ << " regexp "
+ << CEscape(regexp_str_)
+ << " "
+ << CEscape(regexp_->ToString())
+ << " "
+ << " text "
+ << CEscape(text)
+ << " context "
+ << CEscape(context)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatAnchor(anchor)
+ << ", " << FormatMode(flags_)
+ << ")";
+ }
+ continue;
+ }
+
+ // We disagree with PCRE on the meaning of some Unicode matches.
+ // In particular, we treat all non-ASCII UTF-8 as word characters.
+ // We also treat "empty" character sets like [^\w\W] as being
+ // impossible to match, while PCRE apparently excludes some code
+ // points (e.g., 0x0080) from both \w and \W.
+ if (i == kEnginePCRE && NonASCII(text))
+ continue;
+
+ if (!r.untrusted)
+ all_okay = false;
+
+ LOG(INFO) << (r.untrusted ? "(Untrusted) " : "")
+ << "Mismatch: "
+ << EngineString(static_cast<Engine>(i))
+ << " regexp "
+ << CEscape(regexp_str_)
+ << " "
+ << CEscape(regexp_->ToString())
+ << " text "
+ << CEscape(text)
+ << " context "
+ << CEscape(context)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatAnchor(anchor)
+ << ", " << FormatMode(flags_)
+ << ")";
+
+ if (r.matched != correct.matched) {
+ if (r.matched) {
+ LOG(INFO) << " Should not match (but does).";
+ } else {
+ LOG(INFO) << " Should match (but does not).";
+ continue;
+ }
+ }
+ for (int i = 0; i < 1+regexp_->NumCaptures(); i++) {
+ if (r.submatch[i].begin() != correct.submatch[i].begin() ||
+ r.submatch[i].end() != correct.submatch[i].end()) {
+ LOG(INFO) <<
+ StringPrintf(" $%d: should be %s is %s",
+ i,
+ FormatCapture(text, correct.submatch[i]).c_str(),
+ FormatCapture(text, r.submatch[i]).c_str());
+ } else {
+ LOG(INFO) <<
+ StringPrintf(" $%d: %s ok", i,
+ FormatCapture(text, r.submatch[i]).c_str());
+ }
+ }
+ }
+
+ if (!all_okay) {
+ if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
+ LOG(QFATAL) << "Too many regexp failures.";
+ }
+
+ return all_okay;
+}
+
+static Prog::MatchKind kinds[] = {
+ Prog::kFirstMatch,
+ Prog::kLongestMatch,
+ Prog::kFullMatch,
+};
+
+// Test all possible match kinds and parse modes.
+Tester::Tester(const StringPiece& regexp) {
+ error_ = false;
+ for (int i = 0; i < arraysize(kinds); i++) {
+ for (int j = 0; j < arraysize(parse_modes); j++) {
+ TestInstance* t = new TestInstance(regexp, kinds[i],
+ parse_modes[j].parse_flags);
+ error_ |= t->error();
+ v_.push_back(t);
+ }
+ }
+}
+
+Tester::~Tester() {
+ for (int i = 0; i < v_.size(); i++)
+ delete v_[i];
+}
+
+bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+ bool okay = true;
+ for (int i = 0; i < v_.size(); i++)
+ okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
+ return okay;
+}
+
+static Prog::Anchor anchors[] = {
+ Prog::kAnchored,
+ Prog::kUnanchored
+};
+
+bool Tester::TestInput(const StringPiece& text) {
+ return TestInputInContext(text, text);
+}
+
+bool Tester::TestInputInContext(const StringPiece& text,
+ const StringPiece& context) {
+ bool okay = true;
+ for (int i = 0; i < arraysize(anchors); i++)
+ okay &= TestCase(text, context, anchors[i]);
+ return okay;
+}
+
+bool TestRegexpOnText(const StringPiece& regexp,
+ const StringPiece& text) {
+ Tester t(regexp);
+ return t.TestInput(text);
+}
+
+} // namespace re2
diff --git a/re2/testing/tester.h b/re2/testing/tester.h
new file mode 100644
index 0000000..612bb46
--- /dev/null
+++ b/re2/testing/tester.h
@@ -0,0 +1,107 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Comparative tester for regular expression matching.
+// Checks all implementations against each other.
+
+#ifndef RE2_TESTING_TESTER_H__
+#define RE2_TESTING_TESTER_H__
+
+#include "re2/stringpiece.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/re2.h"
+#include "util/pcre.h"
+
+namespace re2 {
+
+class Regexp;
+
+// All the supported regexp engines.
+enum Engine {
+ kEngineBacktrack = 0, // Prog::BadSearchBacktrack
+ kEngineNFA, // Prog::SearchNFA
+ kEngineDFA, // Prog::SearchDFA, only ask whether it matched
+ kEngineDFA1, // Prog::SearchDFA, ask for match[0]
+ kEngineOnePass, // Prog::SearchOnePass, if applicable
+ kEngineBitState, // Prog::SearchBitState
+ kEngineRE2, // RE2, all submatches
+ kEngineRE2a, // RE2, only ask for match[0]
+ kEngineRE2b, // RE2, only ask whether it matched
+ kEnginePCRE, // PCRE (util/pcre.h)
+
+ kEngineMax,
+};
+
+// A TestInstance caches per-regexp state for a given
+// regular expression in a given configuration
+// (UTF-8 vs Latin1, longest vs first match, etc.).
+class TestInstance {
+ public:
+ struct Result;
+
+ TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
+ Regexp::ParseFlags flags);
+ ~TestInstance();
+ Regexp::ParseFlags flags() { return flags_; }
+ bool error() { return error_; }
+
+ // Runs a single test case: search in text, which is in context,
+ // using the given anchoring.
+ bool RunCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor);
+
+ private:
+ // Runs a single search using the named engine type.
+ void RunSearch(Engine type,
+ const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor,
+ Result *result);
+
+ const StringPiece& regexp_str_; // regexp being tested
+ Prog::MatchKind kind_; // kind of match
+ Regexp::ParseFlags flags_; // flags for parsing regexp_str_
+ bool error_; // error during constructor?
+
+ Regexp* regexp_; // parsed regexp
+ Prog* prog_; // compiled program
+ Prog* rprog_; // compiled reverse program
+ PCRE* re_; // PCRE implementation
+ RE2* re2_; // RE2 implementation
+
+ DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
+};
+
+// A group of TestInstances for all possible configurations.
+class Tester {
+ public:
+ explicit Tester(const StringPiece& regexp);
+ ~Tester();
+
+ bool error() { return error_; }
+
+ // Runs a single test case: search in text, which is in context,
+ // using the given anchoring.
+ bool TestCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor);
+
+ // Run TestCase(text, text, anchor) for all anchoring modes.
+ bool TestInput(const StringPiece& text);
+
+ // Run TestCase(text, context, anchor) for all anchoring modes.
+ bool TestInputInContext(const StringPiece& text, const StringPiece& context);
+
+ private:
+ bool error_;
+ vector<TestInstance*> v_;
+
+ DISALLOW_EVIL_CONSTRUCTORS(Tester);
+};
+
+// Run all possible tests using regexp and text.
+bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
+
+} // namespace re2
+
+#endif // RE2_TESTING_TESTER_H__
diff --git a/re2/testing/unicode_test.py b/re2/testing/unicode_test.py
new file mode 100755
index 0000000..a88a3ad
--- /dev/null
+++ b/re2/testing/unicode_test.py
@@ -0,0 +1,207 @@
+#!/usr/bin/python2.4
+#
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Unittest for the util/regexp/re2/unicode.py module."""
+
+import os
+import StringIO
+from google3.pyglib import flags
+from google3.testing.pybase import googletest
+from google3.util.regexp.re2 import unicode
+
+_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party",
+ "unicode", "ucd-5.1.0")
+
+
+class ConvertTest(googletest.TestCase):
+ """Test the conversion functions."""
+
+ def testUInt(self):
+ self.assertEquals(0x0000, unicode._UInt("0000"))
+ self.assertEquals(0x263A, unicode._UInt("263A"))
+ self.assertEquals(0x10FFFF, unicode._UInt("10FFFF"))
+ self.assertRaises(unicode.InputError, unicode._UInt, "263")
+ self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA")
+ self.assertRaises(unicode.InputError, unicode._UInt, "110000")
+
+ def testURange(self):
+ self.assertEquals([1, 2, 3], unicode._URange("0001..0003"))
+ self.assertEquals([1], unicode._URange("0001"))
+ self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005")
+ self.assertRaises(unicode.InputError, unicode._URange, "0003..0001")
+ self.assertRaises(unicode.InputError, unicode._URange, "0001..0001")
+
+ def testUStr(self):
+ self.assertEquals("0x263A", unicode._UStr(0x263a))
+ self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF))
+ self.assertRaises(unicode.InputError, unicode._UStr, 0x110000)
+ self.assertRaises(unicode.InputError, unicode._UStr, -1)
+
+
+_UNICODE_TABLE = """# Commented line, should be ignored.
+# The next line is blank and should be ignored.
+
+0041;Capital A;Line 1
+0061..007A;Lowercase;Line 2
+1F00;<Greek, First>;Ignored
+1FFE;<Greek, Last>;Line 3
+10FFFF;Runemax;Line 4
+0000;Zero;Line 5
+"""
+
+_BAD_TABLE1 = """
+111111;Not a code point;
+"""
+
+_BAD_TABLE2 = """
+0000;<Zero, First>;Missing <Zero, Last>
+"""
+
+_BAD_TABLE3 = """
+0010..0001;Bad range;
+"""
+
+
+class AbortError(Exception):
+ """Function should not have been called."""
+
+
+def Abort():
+ raise AbortError("Abort")
+
+
+def StringTable(s, n, f):
+ unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f)
+
+
+class ReadUnicodeTableTest(googletest.TestCase):
+ """Test the ReadUnicodeTable function."""
+
+ def testSimpleTable(self):
+
+ ncall = [0] # can't assign to ordinary int in DoLine
+
+ def DoLine(codes, fields):
+ self.assertEquals(3, len(fields))
+ ncall[0] += 1
+ self.assertEquals("Line %d" % (ncall[0],), fields[2])
+ if ncall[0] == 1:
+ self.assertEquals([0x0041], codes)
+ self.assertEquals("0041", fields[0])
+ self.assertEquals("Capital A", fields[1])
+ elif ncall[0] == 2:
+ self.assertEquals(range(0x0061, 0x007A + 1), codes)
+ self.assertEquals("0061..007A", fields[0])
+ self.assertEquals("Lowercase", fields[1])
+ elif ncall[0] == 3:
+ self.assertEquals(range(0x1F00, 0x1FFE + 1), codes)
+ self.assertEquals("1F00..1FFE", fields[0])
+ self.assertEquals("Greek", fields[1])
+ elif ncall[0] == 4:
+ self.assertEquals([0x10FFFF], codes)
+ self.assertEquals("10FFFF", fields[0])
+ self.assertEquals("Runemax", fields[1])
+ elif ncall[0] == 5:
+ self.assertEquals([0x0000], codes)
+ self.assertEquals("0000", fields[0])
+ self.assertEquals("Zero", fields[1])
+
+ StringTable(_UNICODE_TABLE, 3, DoLine)
+ self.assertEquals(5, ncall[0])
+
+ def testErrorTables(self):
+ self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort)
+ self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort)
+ self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort)
+ self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE2, 3, Abort)
+ self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort)
+
+
+class ParseContinueTest(googletest.TestCase):
+ """Test the ParseContinue function."""
+
+ def testParseContinue(self):
+ self.assertEquals(("Private Use", "First"),
+ unicode._ParseContinue("<Private Use, First>"))
+ self.assertEquals(("Private Use", "Last"),
+ unicode._ParseContinue("<Private Use, Last>"))
+ self.assertEquals(("<Private Use, Blah>", None),
+ unicode._ParseContinue("<Private Use, Blah>"))
+
+
+class CaseGroupsTest(googletest.TestCase):
+ """Test the CaseGroups function (and the CaseFoldingReader)."""
+
+ def FindGroup(self, c):
+ if type(c) == str:
+ c = ord(c)
+ for g in self.groups:
+ if c in g:
+ return g
+ return None
+
+ def testCaseGroups(self):
+ self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR)
+ self.assertEquals([ord("A"), ord("a")], self.FindGroup("a"))
+ self.assertEquals(None, self.FindGroup("0"))
+
+
+class ScriptsTest(googletest.TestCase):
+ """Test the Scripts function (and the ScriptsReader)."""
+
+ def FindScript(self, c):
+ if type(c) == str:
+ c = ord(c)
+ for script, codes in self.scripts.items():
+ for code in codes:
+ if c == code:
+ return script
+ return None
+
+ def testScripts(self):
+ self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR)
+ self.assertEquals("Latin", self.FindScript("a"))
+ self.assertEquals("Common", self.FindScript("0"))
+ self.assertEquals(None, self.FindScript(0xFFFE))
+
+
+class CategoriesTest(googletest.TestCase):
+ """Test the Categories function (and the UnicodeDataReader)."""
+
+ def FindCategory(self, c):
+ if type(c) == str:
+ c = ord(c)
+ short = None
+ for category, codes in self.categories.items():
+ for code in codes:
+ if code == c:
+ # prefer category Nd over N
+ if len(category) > 1:
+ return category
+ if short == None:
+ short = category
+ return short
+
+ def testCategories(self):
+ self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR)
+ self.assertEquals("Ll", self.FindCategory("a"))
+ self.assertEquals("Nd", self.FindCategory("0"))
+ self.assertEquals("Lo", self.FindCategory(0xAD00)) # in First, Last range
+ self.assertEquals(None, self.FindCategory(0xFFFE))
+ self.assertEquals("Lo", self.FindCategory(0x8B5A))
+ self.assertEquals("Lo", self.FindCategory(0x6C38))
+ self.assertEquals("Lo", self.FindCategory(0x92D2))
+ self.assertTrue(ord("a") in self.categories["L"])
+ self.assertTrue(ord("0") in self.categories["N"])
+ self.assertTrue(0x8B5A in self.categories["L"])
+ self.assertTrue(0x6C38 in self.categories["L"])
+ self.assertTrue(0x92D2 in self.categories["L"])
+
+def main():
+ googletest.main()
+
+if __name__ == "__main__":
+ main()