factor common prefixes from alternations
Juggle code so that RE2::Set can benefit too.
R=rsc
CC=re2-dev
http://codereview.appspot.com/1740045
diff --git a/re2/compile.cc b/re2/compile.cc
index bbc4607..7b54c2d 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -93,7 +93,7 @@
}
Prog::Inst* ip = &inst0[l.p>>1];
- if(l.p&1)
+ if (l.p&1)
ip->out1_ = l2.p;
else
ip->set_out(l2.p);
@@ -132,7 +132,7 @@
// Compiles alternation of all the re to a new Prog.
// Each re has a match with an id equal to its index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re);
+ Regexp* re);
// Interface for Regexp::Walker, which helps traverse the Regexp.
// The walk is purely post-recursive: given the machines for the
@@ -207,7 +207,7 @@
// Single rune.
Frag Literal(Rune r, bool foldcase);
- void Setup(Regexp::ParseFlags, int64);
+ void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
Prog* Finish();
// Returns .* where dot = any byte
@@ -230,6 +230,8 @@
map<uint64, int> rune_cache_;
Frag rune_range_;
+ RE2::Anchor anchor_; // anchor mode for RE2::Set
+
DISALLOW_EVIL_CONSTRUCTORS(Compiler);
};
@@ -478,7 +480,8 @@
rune_range_.end = nullPatchList;
}
-int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
+int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
+ int next) {
Frag f = ByteRange(lo, hi, foldcase);
if (next != 0) {
PatchList::Patch(inst_, f.end, next);
@@ -492,7 +495,9 @@
// In Latin1 mode, there's no point in caching.
// In forward UTF-8 mode, only need to cache continuation bytes.
if (encoding_ == kEncodingLatin1 ||
- (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf))) {
+ (encoding_ == kEncodingUTF8 &&
+ !reversed_ &&
+ !(0x80 <= lo && hi <= 0xbf))) {
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
}
@@ -723,6 +728,14 @@
case kRegexpEmptyMatch:
return Nop();
+ case kRegexpHaveMatch: {
+ Frag f = Match(re->match_id());
+ // Remember unanchored match to end of string.
+ if (anchor_ != RE2::ANCHOR_BOTH)
+ f = Cat(DotStar(), f);
+ return f;
+ }
+
case kRegexpConcat: {
Frag f = child_frags[0];
for (int i = 1; i < nchild_frags; i++)
@@ -896,7 +909,8 @@
}
}
-void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) {
+void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
+ RE2::Anchor anchor) {
prog_->set_flags(flags);
if (flags & Regexp::Latin1)
@@ -926,6 +940,8 @@
max_inst_ = m;
}
+
+ anchor_ = anchor;
}
// Compiles re, returning program.
@@ -936,7 +952,7 @@
Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
Compiler c;
- c.Setup(re->parse_flags(), max_mem);
+ c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
c.reversed_ = reversed;
// Simplify to remove things like counted repetitions
@@ -1035,30 +1051,22 @@
// Compiles RE set to Prog.
Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re) {
+ Regexp* re) {
Compiler c;
- c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()),
- options.max_mem());
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
+ c.Setup(pf, options.max_mem(), anchor);
- // Accumulate alternation of fragments.
- Frag all = kNullFrag;
- for (int i = 0; i < re.size(); i++) {
- Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_);
- if (c.failed_)
- return NULL;
- switch (anchor) {
- case RE2::UNANCHORED:
- f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar());
- break;
- case RE2::ANCHOR_START:
- f = c.Cat(f, c.DotStar());
- break;
- case RE2::ANCHOR_BOTH:
- break;
- }
- f = c.Cat(f, c.Match(i));
- all = c.Alt(all, f);
+ // Compile alternation of fragments.
+ Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
+ re->Decref();
+ if (c.failed_)
+ return NULL;
+
+ if (anchor == RE2::UNANCHORED) {
+ // The trailing .* was added while handling kRegexpHaveMatch.
+ // We just have to add the leading one.
+ all = c.Cat(c.DotStar(), all);
}
c.prog_->set_start(all.begin);
@@ -1066,14 +1074,6 @@
c.prog_->set_anchor_start(true);
c.prog_->set_anchor_end(true);
- // Also create unanchored version, which starts with a .*? loop.
- if (c.prog_->anchor_start()) {
- c.prog_->set_start_unanchored(c.prog_->start());
- } else {
- Frag unanchored = c.Cat(c.DotStar(), all);
- c.prog_->set_start_unanchored(unanchored.begin);
- }
-
Prog* prog = c.Finish();
if (prog == NULL)
return NULL;
@@ -1093,7 +1093,7 @@
}
Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re) {
+ Regexp* re) {
return Compiler::CompileSet(options, anchor, re);
}
diff --git a/re2/mimics_pcre.cc b/re2/mimics_pcre.cc
index c570168..fc6dd4a 100644
--- a/re2/mimics_pcre.cc
+++ b/re2/mimics_pcre.cc
@@ -151,6 +151,7 @@
case kRegexpEndText:
case kRegexpStar: // can always be empty
case kRegexpQuest:
+ case kRegexpHaveMatch:
return true;
case kRegexpConcat: // can be empty if all children can
diff --git a/re2/parse.cc b/re2/parse.cc
index 36b1c8f..eead3d8 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -134,19 +134,6 @@
bool ParsePerlFlags(StringPiece* s);
- // Returns the leading string that re starts with.
- // The returned Rune* points into a piece of re,
- // so it must not be used after the caller calls re->Decref().
- Rune* LeadingString(Regexp* re, int *nrune, Regexp::ParseFlags *flags);
-
- // Removes the first n leading runes from the beginning of re.
- void RemoveLeadingString(Regexp* re, int n);
-
- // Simplifies an alternation of literal strings by factoring out
- // common prefixes.
- int SimplifyAlternation(Regexp** sub, int n, Regexp::ParseFlags altflags,
- int maxdepth);
-
// Finishes the current concatenation,
// collapsing it into a single regexp on the stack.
void DoConcatenation();
@@ -176,11 +163,6 @@
const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
-// Add a range to the character class, but exclude newline if asked.
-// Also handle case folding.
-static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
- Regexp::ParseFlags parse_flags);
-
Regexp::ParseState::ParseState(ParseFlags flags,
const StringPiece& whole_regexp,
RegexpStatus* status)
@@ -650,11 +632,56 @@
return FinishRegexp(re);
}
+// Returns the leading regexp that re starts with.
+// The returned Regexp* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Regexp* Regexp::LeadingRegexp(Regexp* re) {
+ if (re->op() == kRegexpEmptyMatch)
+ return NULL;
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch)
+ return NULL;
+ return sub[0];
+ }
+ return re;
+}
+
+// Removes LeadingRegexp(re) from re and returns what's left.
+// Consumes the reference to re and may edit it in place.
+// If caller wants to hold on to LeadingRegexp(re),
+// must have already Incref'ed it.
+Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
+ if (re->op() == kRegexpEmptyMatch)
+ return re;
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch)
+ return re;
+ sub[0]->Decref();
+ sub[0] = NULL;
+ if (re->nsub() == 2) {
+ // Collapse concatenation to single regexp.
+ Regexp* nre = sub[1];
+ sub[1] = NULL;
+ re->Decref();
+ return nre;
+ }
+ // 3 or more -> 2 or more.
+ re->nsub_--;
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+ return re;
+ }
+ Regexp::ParseFlags pf = re->parse_flags();
+ re->Decref();
+ return new Regexp(kRegexpEmptyMatch, pf);
+}
+
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
-Rune* Regexp::ParseState::LeadingString(Regexp* re, int *nrune,
- Regexp::ParseFlags *flags) {
+Rune* Regexp::LeadingString(Regexp* re, int *nrune,
+ Regexp::ParseFlags *flags) {
while (re->op() == kRegexpConcat && re->nsub() > 0)
re = re->sub()[0];
@@ -676,16 +703,25 @@
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
-void Regexp::ParseState::RemoveLeadingString(Regexp* re, int n) {
- while (re->op() == kRegexpConcat && re->nsub() > 0)
+void Regexp::RemoveLeadingString(Regexp* re, int n) {
+ // Chase down concats to find first string.
+ // For regexps generated by parser, nested concats are
+ // flattened except when doing so would overflow the 16-bit
+ // limit on the size of a concatenation, so we should never
+ // see more than two here.
+ Regexp* stk[4];
+ int d = 0;
+ while (re->op() == kRegexpConcat) {
+ if (d < arraysize(stk))
+ stk[d++] = re;
re = re->sub()[0];
-
- if (re->op() == kRegexpLiteral) {
- re->op_ = kRegexpEmptyMatch;
- return;
}
- if (re->op() == kRegexpLiteralString) {
+ // Remove leading string from re.
+ if (re->op() == kRegexpLiteral) {
+ re->rune_ = 0;
+ re->op_ = kRegexpEmptyMatch;
+ } else if (re->op() == kRegexpLiteralString) {
if (n >= re->nrunes_) {
delete[] re->runes_;
re->runes_ = NULL;
@@ -698,17 +734,50 @@
re->nrunes_ = 0;
re->rune_ = rune;
re->op_ = kRegexpLiteral;
-
} else {
re->nrunes_ -= n;
memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
}
- return;
+ }
+
+ // If re is now empty, concatenations might simplify too.
+ while (d-- > 0) {
+ re = stk[d];
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch) {
+ sub[0]->Decref();
+ sub[0] = NULL;
+ // Delete first element of concat.
+ switch (re->nsub()) {
+ case 0:
+ case 1:
+ // Impossible.
+ LOG(DFATAL) << "Concat of " << re->nsub();
+ re->submany_ = NULL;
+ re->op_ = kRegexpEmptyMatch;
+ break;
+
+ case 2: {
+ // Replace re with sub[1].
+ Regexp* old = sub[1];
+ sub[1] = NULL;
+ re->Swap(old);
+ old->Decref();
+ break;
+ }
+
+ default:
+ // Slide down.
+ re->nsub_--;
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+ break;
+ }
+ }
}
}
-// Simplifies an alternation of literal strings by factoring out
-// common prefixes. For example,
+// Factors common prefixes from alternation.
+// For example,
// ABC|ABD|AEF|BCX|BCY
// simplifies to
// A(B(C|D)|EF)|BC(X|Y)
@@ -719,15 +788,22 @@
// the new length of sub. Adjusts reference counts accordingly
// (incoming sub[i] decremented, outgoing sub[i] incremented).
-// It's too much of a pain to explicitly write out the recursion,
+// It's too much of a pain to write this code with an explicit stack,
// so instead we let the caller specify a maximum depth and
// don't simplify beyond that. There are around 15 words of local
// variables and parameters in the frame, so allowing 8 levels
// on a 64-bit machine is still less than a kilobyte of stack and
// probably enough benefit for practical uses.
-const int kSimplifyAlternationMaxDepth = 8;
+const int kFactorAlternationMaxDepth = 8;
-int Regexp::ParseState::SimplifyAlternation(
+int Regexp::FactorAlternation(
+ Regexp** sub, int n,
+ Regexp::ParseFlags altflags) {
+ return FactorAlternationRecursive(sub, n, altflags,
+ kFactorAlternationMaxDepth);
+}
+
+int Regexp::FactorAlternationRecursive(
Regexp** sub, int n,
Regexp::ParseFlags altflags,
int maxdepth) {
@@ -735,7 +811,7 @@
if (maxdepth <= 0)
return n;
- // Round 1: Factor out common prefixes.
+ // Round 1: Factor out common literal prefixes.
Rune *rune = NULL;
int nrune = 0;
Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
@@ -781,9 +857,9 @@
x[0] = LiteralString(rune, nrune, runeflags);
for (int j = start; j < i; j++)
RemoveLeadingString(sub[j], nrune);
- int nn = SimplifyAlternation(sub + start, i - start, altflags,
- maxdepth - 1);
- x[1] = Alternate(sub + start, nn, altflags);
+ int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
+ maxdepth - 1);
+ x[1] = AlternateNoFactor(sub + start, nn, altflags);
sub[out++] = Concat(x, 2, altflags);
}
@@ -797,7 +873,56 @@
}
n = out;
- // Round 2: Collapse runs of single literals into character classes.
+ // Round 2: Factor out common complex prefixes,
+ // just the first piece of each concatenation,
+ // whatever it is. This is good enough a lot of the time.
+ start = 0;
+ out = 0;
+ Regexp* first = NULL;
+ for (int i = 0; i <= n; i++) {
+ // Invariant: what was in sub[0:start] has been Decref'ed
+ // and that space has been reused for sub[0:out] (out <= start).
+ //
+ // Invariant: sub[start:i] consists of regexps that all begin with first.
+
+ Regexp* first_i = NULL;
+ if (i < n) {
+ first_i = LeadingRegexp(sub[i]);
+ if (first != NULL && Regexp::Equal(first, first_i)) {
+ continue;
+ }
+ }
+
+ // Found end of a run with common leading regexp:
+ // sub[start:i] all begin with first but sub[i] does not.
+ //
+ // Factor out common regexp and append factored expression to sub[0:out].
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ sub[out++] = sub[start];
+ } else {
+ // Construct factored form: prefix(suffix1|suffix2|...)
+ Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|...
+ x[0] = first->Incref();
+ for (int j = start; j < i; j++)
+ sub[j] = RemoveLeadingRegexp(sub[j]);
+ int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
+ maxdepth - 1);
+ x[1] = AlternateNoFactor(sub + start, nn, altflags);
+ sub[out++] = Concat(x, 2, altflags);
+ }
+
+ // Prepare for next round (if there is one).
+ if (i < n) {
+ start = i;
+ first = first_i;
+ }
+ }
+ n = out;
+
+ // Round 3: Collapse runs of single literals into character classes.
start = 0;
out = 0;
for (int i = 0; i <= n; i++) {
@@ -817,7 +942,6 @@
if (i == start) {
// Nothing to do.
} else if (i == start+1) {
- // Char class (or not) of one.
sub[out++] = sub[start];
} else {
// Make new char class.
@@ -829,7 +953,7 @@
for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
ccb.AddRange(it->lo, it->hi);
} else if (re->op() == kRegexpLiteral) {
- AddRange(&ccb, re->rune(), re->rune(), re->parse_flags());
+ ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
} else {
LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
<< re->ToString();
@@ -846,6 +970,20 @@
}
n = out;
+ // Round 4: Collapse runs of empty matches into single empty match.
+ start = 0;
+ out = 0;
+ for (int i = 0; i < n; i++) {
+ if (i + 1 < n &&
+ sub[i]->op() == kRegexpEmptyMatch &&
+ sub[i+1]->op() == kRegexpEmptyMatch) {
+ sub[i]->Decref();
+ continue;
+ }
+ sub[out++] = sub[i];
+ }
+ n = out;
+
return n;
}
@@ -886,10 +1024,7 @@
}
}
- if (op == kRegexpAlternate)
- n = SimplifyAlternation(subs, n, flags_, kSimplifyAlternationMaxDepth);
-
- Regexp* re = ConcatOrAlternate(op, subs, n, flags_);
+ Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true);
delete[] subs;
re->simple_ = re->ComputeSimple();
re->down_ = next;
@@ -1241,25 +1376,25 @@
// Add a range to the character class, but exclude newline if asked.
// Also handle case folding.
-static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
- Regexp::ParseFlags parse_flags) {
+void CharClassBuilder::AddRangeFlags(
+ Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
// Take out \n if the flags say so.
bool cutnl = !(parse_flags & Regexp::ClassNL) ||
(parse_flags & Regexp::NeverNL);
if (cutnl && lo <= '\n' && '\n' <= hi) {
if (lo < '\n')
- AddRange(cc, lo, '\n' - 1, parse_flags);
+ AddRangeFlags(lo, '\n' - 1, parse_flags);
if (hi > '\n')
- AddRange(cc, '\n' + 1, hi, parse_flags);
+ AddRangeFlags('\n' + 1, hi, parse_flags);
return;
}
// If folding case, add fold-equivalent characters too.
if (parse_flags & Regexp::FoldCase)
- AddFoldedRange(cc, lo, hi, 0);
+ AddFoldedRange(this, lo, hi, 0);
else
- cc->AddRange(lo, hi);
+ AddRange(lo, hi);
}
// Look for a group with the given name.
@@ -1298,10 +1433,10 @@
static void AddUGroup(CharClassBuilder *cc, UGroup *g,
Regexp::ParseFlags parse_flags) {
for (int i = 0; i < g->nr16; i++) {
- AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags);
+ cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
}
for (int i = 0; i < g->nr32; i++) {
- AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags);
+ cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
}
}
@@ -1311,16 +1446,16 @@
int next = 0;
for (int i = 0; i < g->nr16; i++) {
if (next < g->r16[i].lo)
- AddRange(cc, next, g->r16[i].lo - 1, parse_flags);
+ cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
next = g->r16[i].hi + 1;
}
for (int i = 0; i < g->nr32; i++) {
if (next < g->r32[i].lo)
- AddRange(cc, next, g->r32[i].lo - 1, parse_flags);
+ cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
next = g->r32[i].hi + 1;
}
if (next <= Runemax)
- AddRange(cc, next, Runemax, parse_flags);
+ cc->AddRangeFlags(next, Runemax, parse_flags);
}
// Maybe parse a Perl character class escape sequence.
@@ -1588,12 +1723,12 @@
re->Decref();
return false;
}
- // AddRange is usually called in response to a class like
+ // AddRangeFlags is usually called in response to a class like
// \p{Foo} or [[:foo:]]; for those, it filters \n out unless
// Regexp::ClassNL is set. In an explicit range or singleton
// like we just parsed, we do not filter \n out, so set ClassNL
// in the flags.
- AddRange(re->ccb_, rr.lo, rr.hi, flags_ | Regexp::ClassNL);
+ re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
}
if (s->size() == 0) {
status->set_code(kRegexpMissingBracket);
diff --git a/re2/prog.cc b/re2/prog.cc
index 897a50a..ef9ef23 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -80,7 +80,7 @@
static_cast<int>(empty_), out());
case kInstMatch:
- return StringPrintf("match!");
+ return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
diff --git a/re2/prog.h b/re2/prog.h
index daeafe4..bbebfdf 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -332,7 +332,7 @@
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re);
+ Regexp* re);
private:
friend class Compiler;
diff --git a/re2/regexp.cc b/re2/regexp.cc
index e988f8a..6ecf834 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -126,6 +126,8 @@
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
+ if (sub == NULL)
+ continue;
if (sub->ref_ == kMaxRef)
sub->Decref();
else
@@ -160,8 +162,13 @@
runes_[nrunes_++] = r;
}
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
+ re->match_id_ = match_id;
+ return re;
+}
-Regexp* Regexp::Plus(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
@@ -170,7 +177,7 @@
return re;
}
-Regexp* Regexp::Star(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
@@ -179,7 +186,7 @@
return re;
}
-Regexp* Regexp::Quest(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
@@ -188,10 +195,25 @@
return re;
}
-Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, ParseFlags flags) {
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+ ParseFlags flags, bool can_factor) {
if (nsub == 1)
return sub[0];
+ Regexp** subcopy = NULL;
+ if (op == kRegexpAlternate && can_factor) {
+ // Going to edit sub; make a copy so we don't step on caller.
+ subcopy = new Regexp*[nsub];
+ memmove(subcopy, sub, nsub * sizeof sub[0]);
+ sub = subcopy;
+ nsub = FactorAlternation(sub, nsub, flags);
+ if (nsub == 1) {
+ Regexp* re = sub[0];
+ delete[] subcopy;
+ return re;
+ }
+ }
+
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
@@ -200,8 +222,11 @@
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
for (int i = 0; i < nbigsub - 1; i++)
- subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags);
- subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags);
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
+ nsub - (nbigsub-1)*kMaxNsub, flags,
+ false);
+ delete[] subcopy;
return re;
}
@@ -210,15 +235,21 @@
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
+
+ delete[] subcopy;
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags);
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags);
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
@@ -261,6 +292,159 @@
return re;
}
+// Swaps this and that in place.
+void Regexp::Swap(Regexp* that) {
+ // Can use memmove because Regexp is just a struct (no vtable).
+ char tmp[sizeof *this];
+ memmove(tmp, this, sizeof tmp);
+ memmove(this, that, sizeof tmp);
+ memmove(that, tmp, sizeof tmp);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+ if (a->op() != b->op())
+ return false;
+
+ switch (a->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpBeginText:
+ return true;
+
+ case kRegexpEndText:
+ // The parse flags remember whether it's \z or (?-m:$),
+ // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+ case kRegexpLiteral:
+ return a->rune() == b->rune() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+ case kRegexpLiteralString:
+ return a->nrunes() == b->nrunes() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+ memcmp(a->runes(), b->runes(),
+ a->nrunes() * sizeof a->runes()[0]) == 0;
+
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ return a->nsub() == b->nsub();
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+ case kRegexpRepeat:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+ a->min() == b->min() &&
+ a->max() == b->max();
+
+ case kRegexpCapture:
+ return a->cap() == b->cap() && a->name() == b->name();
+
+ case kRegexpHaveMatch:
+ return a->match_id() == b->match_id();
+
+ case kRegexpCharClass: {
+ CharClass* acc = a->cc();
+ CharClass* bcc = b->cc();
+ return acc->size() == bcc->size() &&
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+ memcmp(acc->begin(), bcc->begin(),
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+ }
+ }
+
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+ return 0;
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+ if (a == NULL || b == NULL)
+ return a == b;
+
+ if (!TopEqual(a, b))
+ return false;
+
+ // Fast path:
+ // return without allocating vector if there are no subregexps.
+ switch (a->op()) {
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ break;
+
+ default:
+ return true;
+ }
+
+ // Committed to doing real work.
+ // The stack (vector) has pairs of regexps waiting to
+ // be compared. The regexps are only equal if
+ // all the pairs end up being equal.
+ vector<Regexp*> stk;
+
+ for (;;) {
+ // Invariant: TopEqual(a, b) == true.
+ Regexp* a2;
+ Regexp* b2;
+ switch (a->op()) {
+ default:
+ break;
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ for (int i = 0; i < a->nsub(); i++) {
+ a2 = a->sub()[i];
+ b2 = b->sub()[i];
+ if (!TopEqual(a2, b2))
+ return false;
+ stk.push_back(a2);
+ stk.push_back(b2);
+ }
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ a2 = a->sub()[0];
+ b2 = b->sub()[0];
+ if (!TopEqual(a2, b2))
+ return false;
+ // Really:
+ // stk.push_back(a2);
+ // stk.push_back(b2);
+ // break;
+ // but faster to assign directly and loop.
+ a = a2;
+ b = b2;
+ continue;
+ }
+
+ int n = stk.size();
+ if (n == 0)
+ break;
+
+ a = stk[n-2];
+ b = stk[n-1];
+ stk.resize(n-2);
+ }
+
+ return true;
+}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const string kErrorStrings[] = {
diff --git a/re2/regexp.h b/re2/regexp.h
index 7d0c5f5..d844c0c 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -149,7 +149,11 @@
// Matches character class given by cc_.
kRegexpCharClass,
- kMaxRegexpOp = kRegexpCharClass,
+ // Forces match of entire expression right now,
+ // with match ID match_id_ (used by RE2::Set).
+ kRegexpHaveMatch,
+
+ kMaxRegexpOp = kRegexpHaveMatch,
};
// Keep in sync with string list in regexp.cc
@@ -250,7 +254,7 @@
CharClass(); // not implemented
~CharClass(); // not implemented
static CharClass* New(int maxranges);
-
+
friend class CharClassBuilder;
bool folds_ascii_;
@@ -260,39 +264,6 @@
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
};
-// Character class set: contains non-overlapping, non-abutting RuneRanges.
-typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
-
-class CharClassBuilder {
- public:
- CharClassBuilder();
-
- typedef RuneRangeSet::iterator iterator;
- iterator begin() { return ranges_.begin(); }
- iterator end() { return ranges_.end(); }
-
- int size() { return nrunes_; }
- bool empty() { return nrunes_ == 0; }
- bool full() { return nrunes_ == Runemax+1; }
-
- bool Contains(Rune r);
- bool FoldsASCII();
- bool AddRange(Rune lo, Rune hi); // returns whether class changed
- CharClassBuilder* Copy();
- void AddCharClass(CharClassBuilder* cc);
- void Negate();
- void RemoveAbove(Rune r);
- CharClass* GetCharClass();
-
- private:
- static const uint32 AlphaMask = (1<<26) - 1;
- uint32 upper_; // bitmap of A-Z
- uint32 lower_; // bitmap of a-z
- int nrunes_;
- RuneRangeSet ranges_;
- DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
-};
-
class Regexp {
public:
@@ -359,6 +330,7 @@
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+ int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
@@ -415,6 +387,10 @@
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+ static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+ // Like Alternate but does not factor out common prefixes.
+ static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
@@ -463,13 +439,46 @@
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
+ // Helper for testing [sic].
+ friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
- static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, ParseFlags flags);
+ static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+ ParseFlags flags, bool can_factor);
+
+ // Returns the leading string that re starts with.
+ // The returned Rune* points into a piece of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+ // Removes the first n leading runes from the beginning of re.
+ // Edits re in place.
+ static void RemoveLeadingString(Regexp* re, int n);
+
+ // Returns the leading regexp in re's top-level concatenation.
+ // The returned Regexp* points at re or a sub-expression of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Regexp* LeadingRegexp(Regexp* re);
+
+ // Removes LeadingRegexp(re) from re and returns the remainder.
+ // Might edit re in place.
+ static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+ // Simplifies an alternation of literal strings by factoring out
+ // common prefixes.
+ static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+ static int FactorAlternationRecursive(Regexp** sub, int nsub,
+ ParseFlags flags, int maxdepth);
+
+ // Is a == b? Only efficient on regexps that have not been through
+ // Simplify yet - the expansion of a kRegexpRepeat will make this
+ // take a long time. Do not call on such regexps, hence private.
+ static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
void AllocSub(int n) {
@@ -483,6 +492,9 @@
// Add Rune to LiteralString
void AddRuneToString(Rune r);
+ // Swaps this with that, in place.
+ void Swap(Regexp *that);
+
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
@@ -546,12 +558,47 @@
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
+ int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
};
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+ CharClassBuilder();
+
+ typedef RuneRangeSet::iterator iterator;
+ iterator begin() { return ranges_.begin(); }
+ iterator end() { return ranges_.end(); }
+
+ int size() { return nrunes_; }
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+
+ bool Contains(Rune r);
+ bool FoldsASCII();
+ bool AddRange(Rune lo, Rune hi); // returns whether class changed
+ CharClassBuilder* Copy();
+ void AddCharClass(CharClassBuilder* cc);
+ void Negate();
+ void RemoveAbove(Rune r);
+ CharClass* GetCharClass();
+ void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+ static const uint32 AlphaMask = (1<<26) - 1;
+ uint32 upper_; // bitmap of A-Z
+ uint32 lower_; // bitmap of a-z
+ int nrunes_;
+ RuneRangeSet ranges_;
+ DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
+};
+
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
diff --git a/re2/set.cc b/re2/set.cc
index c392ee5..2bcd30a 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -31,11 +31,11 @@
return -1;
}
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+ options_.ParseFlags());
+
RegexpStatus status;
- re2::Regexp* re = Regexp::Parse(
- pattern,
- static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
- &status);
+ re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
@@ -44,18 +44,24 @@
return -1;
}
- re2::Regexp* sre = re->Simplify();
- re->Decref();
- re = sre;
- if(re == NULL) {
- if (error != NULL)
- *error = "simplification failed";
- if (options_.log_errors())
- LOG(ERROR) << "Error simplifying '" << pattern << "'";
- return -1;
- }
-
+ // Concatenate with match index and push on vector.
int n = re_.size();
+ re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
+ if (re->op() == kRegexpConcat) {
+ int nsub = re->nsub();
+ re2::Regexp** sub = new re2::Regexp*[nsub + 1];
+ for (int i = 0; i < nsub; i++)
+ sub[i] = re->sub()[i]->Incref();
+ sub[nsub] = m;
+ re->Decref();
+ re = re2::Regexp::Concat(sub, nsub + 1, pf);
+ delete[] sub;
+ } else {
+ re2::Regexp* sub[2];
+ sub[0] = re;
+ sub[1] = m;
+ re = re2::Regexp::Concat(sub, 2, pf);
+ }
re_.push_back(re);
return n;
}
@@ -66,7 +72,22 @@
return false;
}
compiled_ = true;
- prog_ = Prog::CompileSet(options_, anchor_, re_);
+
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+ options_.ParseFlags());
+ re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
+ re_.size(), pf);
+ re_.clear();
+ re2::Regexp* sre = re->Simplify();
+ re->Decref();
+ re = sre;
+ if (re == NULL) {
+ if (options_.log_errors())
+ LOG(ERROR) << "Error simplifying during Compile.";
+ return false;
+ }
+
+ prog_ = Prog::CompileSet(options_, anchor_, re);
return prog_ != NULL;
}
@@ -79,6 +100,9 @@
bool failed;
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
Prog::kManyMatch, NULL, &failed, v);
+ if (failed)
+ LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
+
if (ret == false)
return false;
if (v->size() == 0) {
diff --git a/re2/simplify.cc b/re2/simplify.cc
index 2c9dbe3..faf3208 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -54,6 +54,7 @@
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
+ case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
@@ -187,6 +188,7 @@
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
+ case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 9614983..8d92105 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -28,77 +28,77 @@
static Test tests[] = {
{ "a",
"1. byte [61-61] -> 2\n"
- "2. match!\n" },
+ "2. match! 0\n" },
{ "ab",
"1. byte [61-61] -> 2\n"
"2. byte [62-62] -> 3\n"
- "3. match!\n" },
+ "3. match! 0\n" },
{ "a|c",
"3. alt -> 1 | 2\n"
"1. byte [61-61] -> 4\n"
"2. byte [63-63] -> 4\n"
- "4. match!\n" },
+ "4. match! 0\n" },
{ "a|b",
"1. byte [61-62] -> 2\n"
- "2. match!\n" },
+ "2. match! 0\n" },
{ "[ab]",
"1. byte [61-62] -> 2\n"
- "2. match!\n" },
+ "2. match! 0\n" },
{ "a+",
"1. byte [61-61] -> 2\n"
"2. alt -> 1 | 3\n"
- "3. match!\n" },
+ "3. match! 0\n" },
{ "a+?",
"1. byte [61-61] -> 2\n"
"2. alt -> 3 | 1\n"
- "3. match!\n" },
+ "3. match! 0\n" },
{ "a*",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 2\n"
- "3. match!\n" },
+ "3. match! 0\n" },
{ "a*?",
"2. alt -> 3 | 1\n"
- "3. match!\n"
+ "3. match! 0\n"
"1. byte [61-61] -> 2\n" },
{ "a?",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 3\n"
- "3. match!\n" },
+ "3. match! 0\n" },
{ "a??",
"2. alt -> 3 | 1\n"
- "3. match!\n"
+ "3. match! 0\n"
"1. byte [61-61] -> 3\n" },
{ "a{4}",
"1. byte [61-61] -> 2\n"
"2. byte [61-61] -> 3\n"
"3. byte [61-61] -> 4\n"
"4. byte [61-61] -> 5\n"
- "5. match!\n" },
+ "5. match! 0\n" },
{ "(a)",
"2. capture 2 -> 1\n"
"1. byte [61-61] -> 3\n"
"3. capture 3 -> 4\n"
- "4. match!\n" },
+ "4. match! 0\n" },
{ "(?:a)",
"1. byte [61-61] -> 2\n"
- "2. match!\n" },
+ "2. match! 0\n" },
{ "",
- "2. match!\n" },
+ "2. match! 0\n" },
{ ".",
"3. alt -> 1 | 2\n"
"1. byte [00-09] -> 4\n"
"2. byte [0b-ff] -> 4\n"
- "4. match!\n" },
+ "4. match! 0\n" },
{ "[^ab]",
"5. alt -> 3 | 4\n"
"3. alt -> 1 | 2\n"
"4. byte [63-ff] -> 6\n"
"1. byte [00-09] -> 6\n"
"2. byte [0b-60] -> 6\n"
- "6. match!\n" },
+ "6. match! 0\n" },
{ "[Aa]",
"1. byte/i [61-61] -> 2\n"
- "2. match!\n" },
+ "2. match! 0\n" },
};
TEST(TestRegexpCompileToProg, Simple) {
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 27c85f5..4bdf714 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc
@@ -49,12 +49,13 @@
"bot",
"eot",
"cc",
+ "match",
};
// Create string representation of regexp with explicit structure.
// Nothing pretty, just for testing.
static void DumpRegexpAppending(Regexp* re, string* s) {
- if (re->op() < 0 || re->op() > kMaxRegexpOp) {
+ if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
StringAppendF(s, "op%d", re->op());
} else {
switch (re->op()) {
@@ -88,6 +89,11 @@
switch (re->op()) {
default:
break;
+ case kRegexpEndText:
+ if (!(re->parse_flags() & Regexp::WasDollar)) {
+ s->append("\\z");
+ }
+ break;
case kRegexpLiteral: {
Rune r = re->rune();
char buf[UTFmax+1];
@@ -124,12 +130,6 @@
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
DumpRegexpAppending(re->sub()[0], s);
break;
- case kRegexpAnyChar:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- break;
case kRegexpCharClass: {
string sep;
for (CharClass::iterator it = re->cc()->begin();
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 7e0de62..106f35b 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -41,7 +41,8 @@
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
{ "", "emp{}" },
- { "|", "alt{emp{}emp{}}" },
+ { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
+ { "|x|", "alt{emp{}lit{x}emp{}}" },
{ ".", "dot{}" },
{ "^", "bol{}" },
{ "$", "eol{}" },
@@ -111,9 +112,9 @@
{ "(?-m)^", "bot{}" },
{ "(?-m)$", "eot{}" },
{ "(?m)\\A", "bot{}" },
- { "(?m)\\z", "eot{}" },
+ { "(?m)\\z", "eot{\\z}" },
{ "(?-m)\\A", "bot{}" },
- { "(?-m)\\z", "eot{}" },
+ { "(?-m)\\z", "eot{\\z}" },
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
@@ -128,16 +129,33 @@
static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | Regexp::PerlX | Regexp::PerlClasses;
-void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, const string& title) {
+bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
+ return Regexp::Equal(a, b);
+}
+
+void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
+ const string& title) {
+ Regexp** re = new Regexp*[ntests];
for (int i = 0; i < ntests; i++) {
RegexpStatus status;
- Regexp* re = Regexp::Parse(tests[i].regexp, flags, &status);
- CHECK(re != NULL) << " " << tests[i].regexp << " "
- << status.Text();
- string s = re->Dump();
- EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp;
- re->Decref();
+ re[i] = Regexp::Parse(tests[i].regexp, flags, &status);
+ CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
+ << status.Text();
+ string s = re[i]->Dump();
+ EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp;
}
+
+ for (int i = 0; i < ntests; i++) {
+ for (int j = 0; j < ntests; j++) {
+ EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
+ RegexpEqualTestingOnly(re[i], re[j]))
+ << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
+ }
+ }
+
+ for (int i = 0; i < ntests; i++)
+ re[i]->Decref();
+ delete[] re;
}
// Test that regexps parse to expected structures.
@@ -204,6 +222,13 @@
"cat{str{bc}cc{0x78-0x79}}}" },
{ "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
{ "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
+ { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
+ { "(?:xx|yy)c|(?:xx|yy)d",
+ "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
+ { "x{2}|x{2}[0-9]",
+ "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
+ { "x{2}y|x{2}[0-9]y",
+ "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
};
// Test that prefix factoring works.
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 4c569b0..ed79663 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -41,6 +41,37 @@
CHECK_EQ(v[0], 1);
}
+TEST(Set, UnanchoredFactored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ CHECK_EQ(s.Add("foo", NULL), 0);
+ CHECK_EQ(s.Add("(", NULL), -1);
+ CHECK_EQ(s.Add("foobar", NULL), 1);
+
+ CHECK_EQ(s.Compile(), true);
+
+ vector<int> v;
+ CHECK_EQ(s.Match("foobar", &v), true);
+ CHECK_EQ(v.size(), 2);
+ CHECK_EQ(v[0], 0);
+ CHECK_EQ(v[1], 1);
+
+ v.clear();
+ CHECK_EQ(s.Match("obarfoobaroo", &v), true);
+ CHECK_EQ(v.size(), 2);
+ CHECK_EQ(v[0], 0);
+ CHECK_EQ(v[1], 1);
+
+ v.clear();
+ CHECK_EQ(s.Match("fooba", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
+
+ v.clear();
+ CHECK_EQ(s.Match("oobar", &v), false);
+ CHECK_EQ(v.size(), 0);
+}
+
TEST(Set, Anchored) {
RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index 6f6cfc5..ed75309 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -123,7 +123,7 @@
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{ "(a|b|)", "([a-b]|(?:))" },
- { "(|)", "((?:)|(?:))" },
+ { "(|)", "()" },
{ "a()", "a()" },
{ "(()|())", "(()|())" },
{ "(a|)", "(a|(?:))" },
diff --git a/re2/tostring.cc b/re2/tostring.cc
index a4aabbe..555524f 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc
@@ -75,6 +75,7 @@
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpCharClass:
+ case kRegexpHaveMatch:
nprec = PrecAtom;
break;
@@ -237,7 +238,10 @@
break;
case kRegexpEndText:
- t_->append("(?-m:$)");
+ if (re->parse_flags() & Regexp::WasDollar)
+ t_->append("(?-m:$)");
+ else
+ t_->append("\\z");
break;
case kRegexpWordBoundary:
@@ -272,6 +276,13 @@
case kRegexpCapture:
t_->append(")");
break;
+
+ case kRegexpHaveMatch:
+ // There's no syntax accepted by the parser to generate
+ // this node (it is generated by RE2::Set) so make something
+ // up that is readable but won't compile.
+ t_->append("(?HaveMatch:%d)", re->match_id());
+ break;
}
// If the parent is an alternation, append the | for it.