bug fix: handle alternation involving a regexp
ending in \C* (or .* in Latin-1 mode) correctly
Bug only affected calls to Match, not FullMatch or PartialMatch.
R=rsc
CC=re2-dev
http://codereview.appspot.com/1690054
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 3bea979..3f5e4b0 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -586,6 +586,7 @@
int n = 0;
uint needflags = 0; // flags needed by kInstEmptyWidth instructions
bool sawmatch = false; // whether queue contains guaranteed kInstMatch
+ bool sawmark = false; // whether queue contains a Mark
if (DebugDFA)
fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
@@ -593,17 +594,26 @@
if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
break;
if (q->is_mark(id)) {
- if (n > 0 && inst[n-1] != Mark)
+ if (n > 0 && inst[n-1] != Mark) {
+ sawmark = true;
inst[n++] = Mark;
+ }
continue;
}
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
case kInstAltMatch:
+ // This state will continue to a match no matter what
+ // the rest of the input is. If it is the highest priority match
+ // being considered, return the special FullMatchState
+ // to indicate that it's all matches from here out.
if (kind_ != Prog::kManyMatch &&
(kind_ != Prog::kFirstMatch ||
- (it == q->begin() && ip->greedy(prog_)))) {
+ (it == q->begin() && ip->greedy(prog_))) &&
+ (kind_ != Prog::kLongestMatch || !sawmark)) {
delete[] inst;
+ if (DebugDFA)
+ fprintf(stderr, " -> FullMatchState\n");
return FullMatchState;
}
// Fall through.
@@ -663,6 +673,8 @@
// if the state is *not* a matching state.
if (n == 0 && flag == 0) {
delete[] inst;
+ if (DebugDFA)
+ fprintf(stderr, " -> DeadState\n");
return DeadState;
}
diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc
index acb338a..acd6432 100644
--- a/re2/testing/dfa_test.cc
+++ b/re2/testing/dfa_test.cc
@@ -100,7 +100,10 @@
//LOG(INFO) << s;
Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
CHECK(re);
- for (int i = 17; i < 28; i++) {
+ int max = 28;
+ if (DEBUG_MODE)
+ max = 24;
+ for (int i = 17; i < max; i++) {
int limit = 1<<i;
int usage, progusage, dfamem;
{
diff --git a/re2/testing/random_test.cc b/re2/testing/random_test.cc
index 6d6b5eb..91d2b32 100644
--- a/re2/testing/random_test.cc
+++ b/re2/testing/random_test.cc
@@ -23,6 +23,14 @@
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper) {
+ // Limit to smaller test cases in debug mode,
+ // because everything is so much slower.
+ if (DEBUG_MODE) {
+ maxatoms--;
+ maxops--;
+ maxstrlen /= 2;
+ }
+
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
@@ -35,14 +43,14 @@
// Tests random small regexps involving literals and egrep operators.
TEST(Random, SmallEgrepLiterals) {
RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(),
- 20, Explode("abc"),
+ 15, Explode("abc"),
"");
}
// Tests random bigger regexps involving literals and egrep operators.
TEST(Random, BigEgrepLiterals) {
RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(),
- 20, Explode("abc"),
+ 15, Explode("abc"),
"");
}
@@ -50,7 +58,7 @@
// and egrep operators.
TEST(Random, SmallEgrepCaptures) {
RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
- 20, Explode("abc"),
+ 15, Explode("abc"),
"");
}
@@ -58,7 +66,7 @@
// and egrep operators.
TEST(Random, BigEgrepCaptures) {
RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
- 20, Explode("abc"),
+ 15, Explode("abc"),
"");
}
diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
index 419d7ba..588e5d5 100644
--- a/re2/testing/search_test.cc
+++ b/re2/testing/search_test.cc
@@ -264,6 +264,8 @@
{ "[^\\\\]+", "Aa\\" },
{ "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
+ // Former bugs.
+ { "a\\C*|ba\\C", "baba" },
};
TEST(Regexp, SearchTests) {