factor common prefixes from alternations
Juggle code so that RE2::Set can benefit too.
R=rsc
CC=re2-dev
http://codereview.appspot.com/1740045
diff --git a/re2/regexp.cc b/re2/regexp.cc
index e988f8a..6ecf834 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -126,6 +126,8 @@
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
+ if (sub == NULL)
+ continue;
if (sub->ref_ == kMaxRef)
sub->Decref();
else
@@ -160,8 +162,13 @@
runes_[nrunes_++] = r;
}
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
+ re->match_id_ = match_id;
+ return re;
+}
-Regexp* Regexp::Plus(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
@@ -170,7 +177,7 @@
return re;
}
-Regexp* Regexp::Star(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
@@ -179,7 +186,7 @@
return re;
}
-Regexp* Regexp::Quest(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
@@ -188,10 +195,25 @@
return re;
}
-Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, ParseFlags flags) {
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+ ParseFlags flags, bool can_factor) {
if (nsub == 1)
return sub[0];
+ Regexp** subcopy = NULL;
+ if (op == kRegexpAlternate && can_factor) {
+ // Going to edit sub; make a copy so we don't step on caller.
+ subcopy = new Regexp*[nsub];
+ memmove(subcopy, sub, nsub * sizeof sub[0]);
+ sub = subcopy;
+ nsub = FactorAlternation(sub, nsub, flags);
+ if (nsub == 1) {
+ Regexp* re = sub[0];
+ delete[] subcopy;
+ return re;
+ }
+ }
+
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
@@ -200,8 +222,11 @@
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
for (int i = 0; i < nbigsub - 1; i++)
- subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags);
- subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags);
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
+ nsub - (nbigsub-1)*kMaxNsub, flags,
+ false);
+ delete[] subcopy;
return re;
}
@@ -210,15 +235,21 @@
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
+
+ delete[] subcopy;
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags);
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags);
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
@@ -261,6 +292,159 @@
return re;
}
+// Swaps this and that in place.
+void Regexp::Swap(Regexp* that) {
+ // Can use memmove because Regexp is just a struct (no vtable).
+ char tmp[sizeof *this];
+ memmove(tmp, this, sizeof tmp);
+ memmove(this, that, sizeof tmp);
+ memmove(that, tmp, sizeof tmp);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+ if (a->op() != b->op())
+ return false;
+
+ switch (a->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpBeginText:
+ return true;
+
+ case kRegexpEndText:
+ // The parse flags remember whether it's \z or (?-m:$),
+ // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+ case kRegexpLiteral:
+ return a->rune() == b->rune() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+ case kRegexpLiteralString:
+ return a->nrunes() == b->nrunes() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+ memcmp(a->runes(), b->runes(),
+ a->nrunes() * sizeof a->runes()[0]) == 0;
+
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ return a->nsub() == b->nsub();
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+ case kRegexpRepeat:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+ a->min() == b->min() &&
+ a->max() == b->max();
+
+ case kRegexpCapture:
+ return a->cap() == b->cap() && a->name() == b->name();
+
+ case kRegexpHaveMatch:
+ return a->match_id() == b->match_id();
+
+ case kRegexpCharClass: {
+ CharClass* acc = a->cc();
+ CharClass* bcc = b->cc();
+ return acc->size() == bcc->size() &&
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+ memcmp(acc->begin(), bcc->begin(),
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+ }
+ }
+
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+ return 0;
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+ if (a == NULL || b == NULL)
+ return a == b;
+
+ if (!TopEqual(a, b))
+ return false;
+
+ // Fast path:
+ // return without allocating vector if there are no subregexps.
+ switch (a->op()) {
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ break;
+
+ default:
+ return true;
+ }
+
+ // Committed to doing real work.
+ // The stack (vector) has pairs of regexps waiting to
+ // be compared. The regexps are only equal if
+ // all the pairs end up being equal.
+ vector<Regexp*> stk;
+
+ for (;;) {
+ // Invariant: TopEqual(a, b) == true.
+ Regexp* a2;
+ Regexp* b2;
+ switch (a->op()) {
+ default:
+ break;
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ for (int i = 0; i < a->nsub(); i++) {
+ a2 = a->sub()[i];
+ b2 = b->sub()[i];
+ if (!TopEqual(a2, b2))
+ return false;
+ stk.push_back(a2);
+ stk.push_back(b2);
+ }
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ a2 = a->sub()[0];
+ b2 = b->sub()[0];
+ if (!TopEqual(a2, b2))
+ return false;
+ // Really:
+ // stk.push_back(a2);
+ // stk.push_back(b2);
+ // break;
+ // but faster to assign directly and loop.
+ a = a2;
+ b = b2;
+ continue;
+ }
+
+ int n = stk.size();
+ if (n == 0)
+ break;
+
+ a = stk[n-2];
+ b = stk[n-1];
+ stk.resize(n-2);
+ }
+
+ return true;
+}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const string kErrorStrings[] = {