factor common prefixes from alternations Juggle code so that RE2::Set can benefit too. R=rsc CC=re2-dev http://codereview.appspot.com/1740045

commit: 4a9f4ca78a66843709737519472becb28c20e20c [log] [tgz]
author: Russ Cox <[email protected]> Thu Jul 15 20:38:05 2010 -0700
committer: Russ Cox <[email protected]> Thu Jul 15 20:38:05 2010 -0700
tree: 3400cc04094db3b906d65f4d7e1c89e7148c1420
parent: 4c213792cf4aa033242d3eb01841c4ba0bd805d7 [diff]
diff --git a/re2/compile.cc b/re2/compile.cc
index bbc4607..7b54c2d 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc

@@ -93,7 +93,7 @@
   }
 
   Prog::Inst* ip = &inst0[l.p>>1];
-  if(l.p&1)
+  if (l.p&1)
     ip->out1_ = l2.p;
   else
     ip->set_out(l2.p);
@@ -132,7 +132,7 @@
   // Compiles alternation of all the re to a new Prog.
   // Each re has a match with an id equal to its index in the vector.
   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                          const vector<Regexp*>& re);
+                          Regexp* re);
 
   // Interface for Regexp::Walker, which helps traverse the Regexp.
   // The walk is purely post-recursive: given the machines for the
@@ -207,7 +207,7 @@
   // Single rune.
   Frag Literal(Rune r, bool foldcase);
 
-  void Setup(Regexp::ParseFlags, int64);
+  void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
   Prog* Finish();
 
   // Returns .* where dot = any byte
@@ -230,6 +230,8 @@
   map<uint64, int> rune_cache_;
   Frag rune_range_;
 
+  RE2::Anchor anchor_;  // anchor mode for RE2::Set
+
   DISALLOW_EVIL_CONSTRUCTORS(Compiler);
 };
 
@@ -478,7 +480,8 @@
   rune_range_.end = nullPatchList;
 }
 
-int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
+int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
+                                     int next) {
   Frag f = ByteRange(lo, hi, foldcase);
   if (next != 0) {
     PatchList::Patch(inst_, f.end, next);
@@ -492,7 +495,9 @@
   // In Latin1 mode, there's no point in caching.
   // In forward UTF-8 mode, only need to cache continuation bytes.
   if (encoding_ == kEncodingLatin1 ||
-      (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf))) {
+      (encoding_ == kEncodingUTF8 &&
+       !reversed_ &&
+       !(0x80 <= lo && hi <= 0xbf))) {
     return UncachedRuneByteSuffix(lo, hi, foldcase, next);
   }
 
@@ -723,6 +728,14 @@
     case kRegexpEmptyMatch:
       return Nop();
 
+    case kRegexpHaveMatch: {
+      Frag f = Match(re->match_id());
+      // Remember unanchored match to end of string.
+      if (anchor_ != RE2::ANCHOR_BOTH)
+        f = Cat(DotStar(), f);
+      return f;
+    }
+
     case kRegexpConcat: {
       Frag f = child_frags[0];
       for (int i = 1; i < nchild_frags; i++)
@@ -896,7 +909,8 @@
   }
 }
 
-void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) {
+void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
+                     RE2::Anchor anchor) {
   prog_->set_flags(flags);
 
   if (flags & Regexp::Latin1)
@@ -926,6 +940,8 @@
 
     max_inst_ = m;
   }
+
+  anchor_ = anchor;
 }
 
 // Compiles re, returning program.
@@ -936,7 +952,7 @@
 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
   Compiler c;
 
-  c.Setup(re->parse_flags(), max_mem);
+  c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
   c.reversed_ = reversed;
 
   // Simplify to remove things like counted repetitions
@@ -1035,30 +1051,22 @@
 
 // Compiles RE set to Prog.
 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                           const vector<Regexp*>& re) {
+                           Regexp* re) {
   Compiler c;
 
-  c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()),
-          options.max_mem());
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
+  c.Setup(pf, options.max_mem(), anchor);
 
-  // Accumulate alternation of fragments.
-  Frag all = kNullFrag;
-  for (int i = 0; i < re.size(); i++) {
-    Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_);
-    if (c.failed_)
-      return NULL;
-    switch (anchor) {
-      case RE2::UNANCHORED:
-        f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar());
-        break;
-      case RE2::ANCHOR_START:
-        f = c.Cat(f, c.DotStar());
-        break;
-      case RE2::ANCHOR_BOTH:
-        break;
-    }
-    f = c.Cat(f, c.Match(i));
-    all = c.Alt(all, f);
+  // Compile alternation of fragments.
+  Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
+  re->Decref();
+  if (c.failed_)
+    return NULL;
+
+  if (anchor == RE2::UNANCHORED) {
+    // The trailing .* was added while handling kRegexpHaveMatch.
+    // We just have to add the leading one.
+    all = c.Cat(c.DotStar(), all);
   }
 
   c.prog_->set_start(all.begin);
@@ -1066,14 +1074,6 @@
   c.prog_->set_anchor_start(true);
   c.prog_->set_anchor_end(true);
 
-  // Also create unanchored version, which starts with a .*? loop.
-  if (c.prog_->anchor_start()) {
-    c.prog_->set_start_unanchored(c.prog_->start());
-  } else {
-    Frag unanchored = c.Cat(c.DotStar(), all);
-    c.prog_->set_start_unanchored(unanchored.begin);
-  }
-
   Prog* prog = c.Finish();
   if (prog == NULL)
     return NULL;
@@ -1093,7 +1093,7 @@
 }
 
 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                       const vector<Regexp*>& re) {
+                       Regexp* re) {
   return Compiler::CompileSet(options, anchor, re);
 }
 

diff --git a/re2/mimics_pcre.cc b/re2/mimics_pcre.cc
index c570168..fc6dd4a 100644
--- a/re2/mimics_pcre.cc
+++ b/re2/mimics_pcre.cc

@@ -151,6 +151,7 @@
     case kRegexpEndText:
     case kRegexpStar:                  // can always be empty
     case kRegexpQuest:
+    case kRegexpHaveMatch:
       return true;
 
     case kRegexpConcat:                // can be empty if all children can

diff --git a/re2/parse.cc b/re2/parse.cc
index 36b1c8f..eead3d8 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc

@@ -134,19 +134,6 @@
   bool ParsePerlFlags(StringPiece* s);
 
 
-  // Returns the leading string that re starts with.
-  // The returned Rune* points into a piece of re,
-  // so it must not be used after the caller calls re->Decref().
-  Rune* LeadingString(Regexp* re, int *nrune, Regexp::ParseFlags *flags);
-
-  // Removes the first n leading runes from the beginning of re.
-  void RemoveLeadingString(Regexp* re, int n);
-
-  // Simplifies an alternation of literal strings by factoring out
-  // common prefixes.
-  int SimplifyAlternation(Regexp** sub, int n, Regexp::ParseFlags altflags,
-                          int maxdepth);
-
   // Finishes the current concatenation,
   // collapsing it into a single regexp on the stack.
   void DoConcatenation();
@@ -176,11 +163,6 @@
 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
 
-// Add a range to the character class, but exclude newline if asked.
-// Also handle case folding.
-static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
-                     Regexp::ParseFlags parse_flags);
-
 Regexp::ParseState::ParseState(ParseFlags flags,
                                const StringPiece& whole_regexp,
                                RegexpStatus* status)
@@ -650,11 +632,56 @@
   return FinishRegexp(re);
 }
 
+// Returns the leading regexp that re starts with.
+// The returned Regexp* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Regexp* Regexp::LeadingRegexp(Regexp* re) {
+  if (re->op() == kRegexpEmptyMatch)
+    return NULL;
+  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+    Regexp** sub = re->sub();
+    if (sub[0]->op() == kRegexpEmptyMatch)
+      return NULL;
+    return sub[0];
+  }
+  return re;
+}
+
+// Removes LeadingRegexp(re) from re and returns what's left.
+// Consumes the reference to re and may edit it in place.
+// If caller wants to hold on to LeadingRegexp(re),
+// must have already Incref'ed it.
+Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
+  if (re->op() == kRegexpEmptyMatch)
+    return re;
+  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+    Regexp** sub = re->sub();
+    if (sub[0]->op() == kRegexpEmptyMatch)
+      return re;
+    sub[0]->Decref();
+    sub[0] = NULL;
+    if (re->nsub() == 2) {
+      // Collapse concatenation to single regexp.
+      Regexp* nre = sub[1];
+      sub[1] = NULL;
+      re->Decref();
+      return nre;
+    }
+    // 3 or more -> 2 or more.
+    re->nsub_--;
+    memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+    return re;
+  }
+  Regexp::ParseFlags pf = re->parse_flags();
+  re->Decref();
+  return new Regexp(kRegexpEmptyMatch, pf);
+}
+
 // Returns the leading string that re starts with.
 // The returned Rune* points into a piece of re,
 // so it must not be used after the caller calls re->Decref().
-Rune* Regexp::ParseState::LeadingString(Regexp* re, int *nrune,
-                                        Regexp::ParseFlags *flags) {
+Rune* Regexp::LeadingString(Regexp* re, int *nrune,
+                            Regexp::ParseFlags *flags) {
   while (re->op() == kRegexpConcat && re->nsub() > 0)
     re = re->sub()[0];
 
@@ -676,16 +703,25 @@
 
 // Removes the first n leading runes from the beginning of re.
 // Edits re in place.
-void Regexp::ParseState::RemoveLeadingString(Regexp* re, int n) {
-  while (re->op() == kRegexpConcat && re->nsub() > 0)
+void Regexp::RemoveLeadingString(Regexp* re, int n) {
+  // Chase down concats to find first string.
+  // For regexps generated by parser, nested concats are
+  // flattened except when doing so would overflow the 16-bit
+  // limit on the size of a concatenation, so we should never
+  // see more than two here.
+  Regexp* stk[4];
+  int d = 0;
+  while (re->op() == kRegexpConcat) {
+    if (d < arraysize(stk))
+      stk[d++] = re;
     re = re->sub()[0];
-
-  if (re->op() == kRegexpLiteral) {
-    re->op_ = kRegexpEmptyMatch;
-    return;
   }
 
-  if (re->op() == kRegexpLiteralString) {
+  // Remove leading string from re.
+  if (re->op() == kRegexpLiteral) {
+    re->rune_ = 0;
+    re->op_ = kRegexpEmptyMatch;
+  } else if (re->op() == kRegexpLiteralString) {
     if (n >= re->nrunes_) {
       delete[] re->runes_;
       re->runes_ = NULL;
@@ -698,17 +734,50 @@
       re->nrunes_ = 0;
       re->rune_ = rune;
       re->op_ = kRegexpLiteral;
-
     } else {
       re->nrunes_ -= n;
       memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
     }
-    return;
+  }
+
+  // If re is now empty, concatenations might simplify too.
+  while (d-- > 0) {
+    re = stk[d];
+    Regexp** sub = re->sub();
+    if (sub[0]->op() == kRegexpEmptyMatch) {
+      sub[0]->Decref();
+      sub[0] = NULL;
+      // Delete first element of concat.
+      switch (re->nsub()) {
+        case 0:
+        case 1:
+          // Impossible.
+          LOG(DFATAL) << "Concat of " << re->nsub();
+          re->submany_ = NULL;
+          re->op_ = kRegexpEmptyMatch;
+          break;
+
+        case 2: {
+          // Replace re with sub[1].
+          Regexp* old = sub[1];
+          sub[1] = NULL;
+          re->Swap(old);
+          old->Decref();
+          break;
+        }
+
+        default:
+          // Slide down.
+          re->nsub_--;
+          memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+          break;
+      }
+    }
   }
 }
 
-// Simplifies an alternation of literal strings by factoring out
-// common prefixes.  For example,
+// Factors common prefixes from alternation.
+// For example,
 //     ABC|ABD|AEF|BCX|BCY
 // simplifies to
 //     A(B(C|D)|EF)|BC(X|Y)
@@ -719,15 +788,22 @@
 // the new length of sub.  Adjusts reference counts accordingly
 // (incoming sub[i] decremented, outgoing sub[i] incremented).
 
-// It's too much of a pain to explicitly write out the recursion,
+// It's too much of a pain to write this code with an explicit stack,
 // so instead we let the caller specify a maximum depth and
 // don't simplify beyond that.  There are around 15 words of local
 // variables and parameters in the frame, so allowing 8 levels
 // on a 64-bit machine is still less than a kilobyte of stack and
 // probably enough benefit for practical uses.
-const int kSimplifyAlternationMaxDepth = 8;
+const int kFactorAlternationMaxDepth = 8;
 
-int Regexp::ParseState::SimplifyAlternation(
+int Regexp::FactorAlternation(
+    Regexp** sub, int n,
+    Regexp::ParseFlags altflags) {
+  return FactorAlternationRecursive(sub, n, altflags,
+                                    kFactorAlternationMaxDepth);
+}
+
+int Regexp::FactorAlternationRecursive(
     Regexp** sub, int n,
     Regexp::ParseFlags altflags,
     int maxdepth) {
@@ -735,7 +811,7 @@
   if (maxdepth <= 0)
     return n;
 
-  // Round 1: Factor out common prefixes.
+  // Round 1: Factor out common literal prefixes.
   Rune *rune = NULL;
   int nrune = 0;
   Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
@@ -781,9 +857,9 @@
       x[0] = LiteralString(rune, nrune, runeflags);
       for (int j = start; j < i; j++)
         RemoveLeadingString(sub[j], nrune);
-      int nn = SimplifyAlternation(sub + start, i - start, altflags,
-                                   maxdepth - 1);
-      x[1] = Alternate(sub + start, nn, altflags);
+      int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
+                                          maxdepth - 1);
+      x[1] = AlternateNoFactor(sub + start, nn, altflags);
       sub[out++] = Concat(x, 2, altflags);
     }
 
@@ -797,7 +873,56 @@
   }
   n = out;
 
-  // Round 2: Collapse runs of single literals into character classes.
+  // Round 2: Factor out common complex prefixes,
+  // just the first piece of each concatenation,
+  // whatever it is.  This is good enough a lot of the time.
+  start = 0;
+  out = 0;
+  Regexp* first = NULL;
+  for (int i = 0; i <= n; i++) {
+    // Invariant: what was in sub[0:start] has been Decref'ed
+    // and that space has been reused for sub[0:out] (out <= start).
+    //
+    // Invariant: sub[start:i] consists of regexps that all begin with first.
+
+    Regexp* first_i = NULL;
+    if (i < n) {
+      first_i = LeadingRegexp(sub[i]);
+      if (first != NULL && Regexp::Equal(first, first_i)) {
+        continue;
+      }
+    }
+
+    // Found end of a run with common leading regexp:
+    // sub[start:i] all begin with first but sub[i] does not.
+    //
+    // Factor out common regexp and append factored expression to sub[0:out].
+    if (i == start) {
+      // Nothing to do - first iteration.
+    } else if (i == start+1) {
+      // Just one: don't bother factoring.
+      sub[out++] = sub[start];
+    } else {
+      // Construct factored form: prefix(suffix1|suffix2|...)
+      Regexp* x[2];  // x[0] = prefix, x[1] = suffix1|suffix2|...
+      x[0] = first->Incref();
+      for (int j = start; j < i; j++)
+        sub[j] = RemoveLeadingRegexp(sub[j]);
+      int nn = FactorAlternationRecursive(sub + start, i - start, altflags,
+                                   maxdepth - 1);
+      x[1] = AlternateNoFactor(sub + start, nn, altflags);
+      sub[out++] = Concat(x, 2, altflags);
+    }
+
+    // Prepare for next round (if there is one).
+    if (i < n) {
+      start = i;
+      first = first_i;
+    }
+  }
+  n = out;
+
+  // Round 3: Collapse runs of single literals into character classes.
   start = 0;
   out = 0;
   for (int i = 0; i <= n; i++) {
@@ -817,7 +942,6 @@
     if (i == start) {
       // Nothing to do.
     } else if (i == start+1) {
-      // Char class (or not) of one.
       sub[out++] = sub[start];
     } else {
       // Make new char class.
@@ -829,7 +953,7 @@
           for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
             ccb.AddRange(it->lo, it->hi);
         } else if (re->op() == kRegexpLiteral) {
-          AddRange(&ccb, re->rune(), re->rune(), re->parse_flags());
+          ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
         } else {
           LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
                       << re->ToString();
@@ -846,6 +970,20 @@
   }
   n = out;
 
+  // Round 4: Collapse runs of empty matches into single empty match.
+  start = 0;
+  out = 0;
+  for (int i = 0; i < n; i++) {
+    if (i + 1 < n &&
+        sub[i]->op() == kRegexpEmptyMatch &&
+        sub[i+1]->op() == kRegexpEmptyMatch) {
+      sub[i]->Decref();
+      continue;
+    }
+    sub[out++] = sub[i];
+  }
+  n = out;
+
   return n;
 }
 
@@ -886,10 +1024,7 @@
     }
   }
 
-  if (op == kRegexpAlternate)
-    n = SimplifyAlternation(subs, n, flags_, kSimplifyAlternationMaxDepth);
-
-  Regexp* re = ConcatOrAlternate(op, subs, n, flags_);
+  Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true);
   delete[] subs;
   re->simple_ = re->ComputeSimple();
   re->down_ = next;
@@ -1241,25 +1376,25 @@
 
 // Add a range to the character class, but exclude newline if asked.
 // Also handle case folding.
-static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
-                     Regexp::ParseFlags parse_flags) {
+void CharClassBuilder::AddRangeFlags(
+    Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
 
   // Take out \n if the flags say so.
   bool cutnl = !(parse_flags & Regexp::ClassNL) ||
                (parse_flags & Regexp::NeverNL);
   if (cutnl && lo <= '\n' && '\n' <= hi) {
     if (lo < '\n')
-      AddRange(cc, lo, '\n' - 1, parse_flags);
+      AddRangeFlags(lo, '\n' - 1, parse_flags);
     if (hi > '\n')
-      AddRange(cc, '\n' + 1, hi, parse_flags);
+      AddRangeFlags('\n' + 1, hi, parse_flags);
     return;
   }
 
   // If folding case, add fold-equivalent characters too.
   if (parse_flags & Regexp::FoldCase)
-    AddFoldedRange(cc, lo, hi, 0);
+    AddFoldedRange(this, lo, hi, 0);
   else
-    cc->AddRange(lo, hi);
+    AddRange(lo, hi);
 }
 
 // Look for a group with the given name.
@@ -1298,10 +1433,10 @@
 static void AddUGroup(CharClassBuilder *cc, UGroup *g,
                       Regexp::ParseFlags parse_flags) {
   for (int i = 0; i < g->nr16; i++) {
-    AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags);
+    cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
   }
   for (int i = 0; i < g->nr32; i++) {
-    AddRange(cc, g->r32[i].lo, g->r32[i].hi, parse_flags);
+    cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
   }
 }
 
@@ -1311,16 +1446,16 @@
   int next = 0;
   for (int i = 0; i < g->nr16; i++) {
     if (next < g->r16[i].lo)
-      AddRange(cc, next, g->r16[i].lo - 1, parse_flags);
+      cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
     next = g->r16[i].hi + 1;
   }
   for (int i = 0; i < g->nr32; i++) {
     if (next < g->r32[i].lo)
-      AddRange(cc, next, g->r32[i].lo - 1, parse_flags);
+      cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
     next = g->r32[i].hi + 1;
   }
   if (next <= Runemax)
-    AddRange(cc, next, Runemax, parse_flags);
+    cc->AddRangeFlags(next, Runemax, parse_flags);
 }
 
 // Maybe parse a Perl character class escape sequence.
@@ -1588,12 +1723,12 @@
       re->Decref();
       return false;
     }
-    // AddRange is usually called in response to a class like
+    // AddRangeFlags is usually called in response to a class like
     // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
     // Regexp::ClassNL is set.  In an explicit range or singleton
     // like we just parsed, we do not filter \n out, so set ClassNL
     // in the flags.
-    AddRange(re->ccb_, rr.lo, rr.hi, flags_ | Regexp::ClassNL);
+    re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
   }
   if (s->size() == 0) {
     status->set_code(kRegexpMissingBracket);

diff --git a/re2/prog.cc b/re2/prog.cc
index 897a50a..ef9ef23 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc

@@ -80,7 +80,7 @@
                           static_cast<int>(empty_), out());
 
     case kInstMatch:
-      return StringPrintf("match!");
+      return StringPrintf("match! %d", match_id());
 
     case kInstNop:
       return StringPrintf("nop -> %d", out());

diff --git a/re2/prog.h b/re2/prog.h
index daeafe4..bbebfdf 100644
--- a/re2/prog.h
+++ b/re2/prog.h

@@ -332,7 +332,7 @@
   // Compiles a collection of regexps to Prog.  Each regexp will have
   // its own Match instruction recording the index in the vector.
   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                          const vector<Regexp*>& re);
+                          Regexp* re);
 
  private:
   friend class Compiler;

diff --git a/re2/regexp.cc b/re2/regexp.cc
index e988f8a..6ecf834 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc

@@ -126,6 +126,8 @@
       Regexp** subs = re->sub();
       for (int i = 0; i < re->nsub_; i++) {
         Regexp* sub = subs[i];
+        if (sub == NULL)
+          continue;
         if (sub->ref_ == kMaxRef)
           sub->Decref();
         else
@@ -160,8 +162,13 @@
   runes_[nrunes_++] = r;
 }
 
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+  Regexp* re = new Regexp(kRegexpHaveMatch, flags);
+  re->match_id_ = match_id;
+  return re;
+}
 
-Regexp* Regexp::Plus(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
   if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
     return sub;
   Regexp* re = new Regexp(kRegexpPlus, flags);
@@ -170,7 +177,7 @@
   return re;
 }
 
-Regexp* Regexp::Star(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
   if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
     return sub;
   Regexp* re = new Regexp(kRegexpStar, flags);
@@ -179,7 +186,7 @@
   return re;
 }
 
-Regexp* Regexp::Quest(Regexp* sub, Regexp::ParseFlags flags) {
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
   if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
     return sub;
   Regexp* re = new Regexp(kRegexpQuest, flags);
@@ -188,10 +195,25 @@
   return re;
 }
 
-Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, ParseFlags flags) {
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+                                  ParseFlags flags, bool can_factor) {
   if (nsub == 1)
     return sub[0];
 
+  Regexp** subcopy = NULL;
+  if (op == kRegexpAlternate && can_factor) {
+    // Going to edit sub; make a copy so we don't step on caller.
+    subcopy = new Regexp*[nsub];
+    memmove(subcopy, sub, nsub * sizeof sub[0]);
+    sub = subcopy;
+    nsub = FactorAlternation(sub, nsub, flags);
+    if (nsub == 1) {
+      Regexp* re = sub[0];
+      delete[] subcopy;
+      return re;
+    }
+  }
+
   if (nsub > kMaxNsub) {
     // Too many subexpressions to fit in a single Regexp.
     // Make a two-level tree.  Two levels gets us to 65535^2.
@@ -200,8 +222,11 @@
     re->AllocSub(nbigsub);
     Regexp** subs = re->sub();
     for (int i = 0; i < nbigsub - 1; i++)
-      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags);
-    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags);
+      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
+    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
+                                          nsub - (nbigsub-1)*kMaxNsub, flags,
+                                          false);
+    delete[] subcopy;
     return re;
   }
 
@@ -210,15 +235,21 @@
   Regexp** subs = re->sub();
   for (int i = 0; i < nsub; i++)
     subs[i] = sub[i];
+
+  delete[] subcopy;
   return re;
 }
 
 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
-  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags);
+  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
 }
 
 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
-  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags);
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
 }
 
 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
@@ -261,6 +292,159 @@
   return re;
 }
 
+// Swaps this and that in place.
+void Regexp::Swap(Regexp* that) {
+  // Can use memmove because Regexp is just a struct (no vtable).
+  char tmp[sizeof *this];
+  memmove(tmp, this, sizeof tmp);
+  memmove(this, that, sizeof tmp);
+  memmove(that, tmp, sizeof tmp);
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+  if (a->op() != b->op())
+    return false;
+
+  switch (a->op()) {
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpBeginText:
+      return true;
+
+    case kRegexpEndText:
+      // The parse flags remember whether it's \z or (?-m:$),
+      // which matters when testing against PCRE.
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+    case kRegexpLiteral:
+      return a->rune() == b->rune() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+    case kRegexpLiteralString:
+      return a->nrunes() == b->nrunes() &&
+             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+             memcmp(a->runes(), b->runes(),
+                    a->nrunes() * sizeof a->runes()[0]) == 0;
+
+    case kRegexpAlternate:
+    case kRegexpConcat:
+      return a->nsub() == b->nsub();
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+    case kRegexpRepeat:
+      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+             a->min() == b->min() &&
+             a->max() == b->max();
+
+    case kRegexpCapture:
+      return a->cap() == b->cap() && a->name() == b->name();
+
+    case kRegexpHaveMatch:
+      return a->match_id() == b->match_id();
+
+    case kRegexpCharClass: {
+      CharClass* acc = a->cc();
+      CharClass* bcc = b->cc();
+      return acc->size() == bcc->size() &&
+             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+             memcmp(acc->begin(), bcc->begin(),
+                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+    }
+  }
+
+  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+  return 0;
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+  if (a == NULL || b == NULL)
+    return a == b;
+
+  if (!TopEqual(a, b))
+    return false;
+
+  // Fast path:
+  // return without allocating vector if there are no subregexps.
+  switch (a->op()) {
+    case kRegexpAlternate:
+    case kRegexpConcat:
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+    case kRegexpRepeat:
+    case kRegexpCapture:
+      break;
+
+    default:
+      return true;
+  }
+
+  // Committed to doing real work.
+  // The stack (vector) has pairs of regexps waiting to
+  // be compared.  The regexps are only equal if
+  // all the pairs end up being equal.
+  vector<Regexp*> stk;
+
+  for (;;) {
+    // Invariant: TopEqual(a, b) == true.
+    Regexp* a2;
+    Regexp* b2;
+    switch (a->op()) {
+      default:
+        break;
+      case kRegexpAlternate:
+      case kRegexpConcat:
+        for (int i = 0; i < a->nsub(); i++) {
+          a2 = a->sub()[i];
+          b2 = b->sub()[i];
+          if (!TopEqual(a2, b2))
+            return false;
+          stk.push_back(a2);
+          stk.push_back(b2);
+        }
+        break;
+
+      case kRegexpStar:
+      case kRegexpPlus:
+      case kRegexpQuest:
+      case kRegexpRepeat:
+      case kRegexpCapture:
+        a2 = a->sub()[0];
+        b2 = b->sub()[0];
+        if (!TopEqual(a2, b2))
+          return false;
+        // Really:
+        //   stk.push_back(a2);
+        //   stk.push_back(b2);
+        //   break;
+        // but faster to assign directly and loop.
+        a = a2;
+        b = b2;
+        continue;
+    }
+
+    int n = stk.size();
+    if (n == 0)
+      break;
+
+    a = stk[n-2];
+    b = stk[n-1];
+    stk.resize(n-2);
+  }
+
+  return true;
+}
 
 // Keep in sync with enum RegexpStatusCode in regexp.h
 static const string kErrorStrings[] = {

diff --git a/re2/regexp.h b/re2/regexp.h
index 7d0c5f5..d844c0c 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h

@@ -149,7 +149,11 @@
   // Matches character class given by cc_.
   kRegexpCharClass,
 
-  kMaxRegexpOp = kRegexpCharClass,
+  // Forces match of entire expression right now,
+  // with match ID match_id_ (used by RE2::Set).
+  kRegexpHaveMatch,
+
+  kMaxRegexpOp = kRegexpHaveMatch,
 };
 
 // Keep in sync with string list in regexp.cc
@@ -250,7 +254,7 @@
   CharClass();  // not implemented
   ~CharClass();  // not implemented
   static CharClass* New(int maxranges);
-  
+
   friend class CharClassBuilder;
 
   bool folds_ascii_;
@@ -260,39 +264,6 @@
   DISALLOW_EVIL_CONSTRUCTORS(CharClass);
 };
 
-// Character class set: contains non-overlapping, non-abutting RuneRanges.
-typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
-
-class CharClassBuilder {
- public:
-  CharClassBuilder();
-
-  typedef RuneRangeSet::iterator iterator;
-  iterator begin() { return ranges_.begin(); }
-  iterator end() { return ranges_.end(); }
-
-  int size() { return nrunes_; }
-  bool empty() { return nrunes_ == 0; }
-  bool full() { return nrunes_ == Runemax+1; }
-
-  bool Contains(Rune r);
-  bool FoldsASCII();
-  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
-  CharClassBuilder* Copy();
-  void AddCharClass(CharClassBuilder* cc);
-  void Negate();
-  void RemoveAbove(Rune r);
-  CharClass* GetCharClass();
-
- private:
-  static const uint32 AlphaMask = (1<<26) - 1;
-  uint32 upper_;  // bitmap of A-Z
-  uint32 lower_;  // bitmap of a-z
-  int nrunes_;
-  RuneRangeSet ranges_;
-  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
-};
-
 class Regexp {
  public:
 
@@ -359,6 +330,7 @@
   const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
   Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
   int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
 
   // Increments reference count, returns object as convenience.
   Regexp* Incref();
@@ -415,6 +387,10 @@
   static Regexp* NewLiteral(Rune rune, ParseFlags flags);
   static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
   static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+  static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+  // Like Alternate but does not factor out common prefixes.
+  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
 
   // Debugging function.  Returns string format for regexp
   // that makes structure clear.  Does NOT use regexp syntax.
@@ -463,13 +439,46 @@
   friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
                              RegexpStatus* status);
 
+  // Helper for testing [sic].
+  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
   // Computes whether Regexp is already simple.
   bool ComputeSimple();
 
   // Constructor that generates a concatenation or alternation,
   // enforcing the limit on the number of subexpressions for
   // a particular Regexp.
-  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+                                   ParseFlags flags, bool can_factor);
+
+  // Returns the leading string that re starts with.
+  // The returned Rune* points into a piece of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+  // Removes the first n leading runes from the beginning of re.
+  // Edits re in place.
+  static void RemoveLeadingString(Regexp* re, int n);
+
+  // Returns the leading regexp in re's top-level concatenation.
+  // The returned Regexp* points at re or a sub-expression of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Regexp* LeadingRegexp(Regexp* re);
+
+  // Removes LeadingRegexp(re) from re and returns the remainder.
+  // Might edit re in place.
+  static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+  // Simplifies an alternation of literal strings by factoring out
+  // common prefixes.
+  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+  static int FactorAlternationRecursive(Regexp** sub, int nsub,
+                                        ParseFlags flags, int maxdepth);
+
+  // Is a == b?  Only efficient on regexps that have not been through
+  // Simplify yet - the expansion of a kRegexpRepeat will make this
+  // take a long time.  Do not call on such regexps, hence private.
+  static bool Equal(Regexp* a, Regexp* b);
 
   // Allocate space for n sub-regexps.
   void AllocSub(int n) {
@@ -483,6 +492,9 @@
   // Add Rune to LiteralString
   void AddRuneToString(Rune r);
 
+  // Swaps this with that, in place.
+  void Swap(Regexp *that);
+
   // Operator.  See description of operators above.
   // uint8 instead of RegexpOp to control space usage.
   uint8 op_;
@@ -546,12 +558,47 @@
       CharClassBuilder* ccb_;
     };
     Rune rune_;  // Literal
+    int match_id_;  // HaveMatch
     void *the_union_[2];  // as big as any other element, for memset
   };
 
   DISALLOW_EVIL_CONSTRUCTORS(Regexp);
 };
 
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+  CharClassBuilder();
+
+  typedef RuneRangeSet::iterator iterator;
+  iterator begin() { return ranges_.begin(); }
+  iterator end() { return ranges_.end(); }
+
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+
+  bool Contains(Rune r);
+  bool FoldsASCII();
+  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
+  CharClassBuilder* Copy();
+  void AddCharClass(CharClassBuilder* cc);
+  void Negate();
+  void RemoveAbove(Rune r);
+  CharClass* GetCharClass();
+  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+  static const uint32 AlphaMask = (1<<26) - 1;
+  uint32 upper_;  // bitmap of A-Z
+  uint32 lower_;  // bitmap of a-z
+  int nrunes_;
+  RuneRangeSet ranges_;
+  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
+};
+
 // Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
 {

diff --git a/re2/set.cc b/re2/set.cc
index c392ee5..2bcd30a 100644
--- a/re2/set.cc
+++ b/re2/set.cc

@@ -31,11 +31,11 @@
     return -1;
   }
 
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+    options_.ParseFlags());
+
   RegexpStatus status;
-  re2::Regexp* re = Regexp::Parse(
-    pattern,
-    static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
-    &status);
+  re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
   if (re == NULL) {
     if (error != NULL)
       *error = status.Text();
@@ -44,18 +44,24 @@
     return -1;
   }
 
-  re2::Regexp* sre = re->Simplify();
-  re->Decref();
-  re = sre;
-  if(re == NULL) {
-    if (error != NULL)
-      *error = "simplification failed";
-    if (options_.log_errors())
-      LOG(ERROR) << "Error simplifying '" << pattern << "'";
-    return -1;
-  }
-
+  // Concatenate with match index and push on vector.
   int n = re_.size();
+  re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
+  if (re->op() == kRegexpConcat) {
+    int nsub = re->nsub();
+    re2::Regexp** sub = new re2::Regexp*[nsub + 1];
+    for (int i = 0; i < nsub; i++)
+      sub[i] = re->sub()[i]->Incref();
+    sub[nsub] = m;
+    re->Decref();
+    re = re2::Regexp::Concat(sub, nsub + 1, pf);
+    delete[] sub;
+  } else {
+    re2::Regexp* sub[2];
+    sub[0] = re;
+    sub[1] = m;
+    re = re2::Regexp::Concat(sub, 2, pf);
+  }
   re_.push_back(re);
   return n;
 }
@@ -66,7 +72,22 @@
     return false;
   }
   compiled_ = true;
-  prog_ = Prog::CompileSet(options_, anchor_, re_);
+
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+    options_.ParseFlags());
+  re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
+                                           re_.size(), pf);
+  re_.clear();
+  re2::Regexp* sre = re->Simplify();
+  re->Decref();
+  re = sre;
+  if (re == NULL) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Error simplifying during Compile.";
+    return false;
+  }
+
+  prog_ = Prog::CompileSet(options_, anchor_, re);
   return prog_ != NULL;
 }
 
@@ -79,6 +100,9 @@
   bool failed;
   bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
                               Prog::kManyMatch, NULL, &failed, v);
+  if (failed)
+    LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
+
   if (ret == false)
     return false;
   if (v->size() == 0) {

diff --git a/re2/simplify.cc b/re2/simplify.cc
index 2c9dbe3..faf3208 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc

@@ -54,6 +54,7 @@
     case kRegexpEndText:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
+    case kRegexpHaveMatch:
       return true;
     case kRegexpConcat:
     case kRegexpAlternate:
@@ -187,6 +188,7 @@
     case kRegexpEndText:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
+    case kRegexpHaveMatch:
       // All these are always simple.
       re->simple_ = true;
       return re->Incref();

diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 9614983..8d92105 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc

@@ -28,77 +28,77 @@
 static Test tests[] = {
   { "a",
     "1. byte [61-61] -> 2\n"
-    "2. match!\n" },
+    "2. match! 0\n" },
   { "ab",
     "1. byte [61-61] -> 2\n"
     "2. byte [62-62] -> 3\n"
-    "3. match!\n" },
+    "3. match! 0\n" },
   { "a|c",
     "3. alt -> 1 | 2\n"
     "1. byte [61-61] -> 4\n"
     "2. byte [63-63] -> 4\n"
-    "4. match!\n" },
+    "4. match! 0\n" },
   { "a|b",
     "1. byte [61-62] -> 2\n"
-    "2. match!\n" },
+    "2. match! 0\n" },
   { "[ab]",
     "1. byte [61-62] -> 2\n"
-    "2. match!\n" },
+    "2. match! 0\n" },
   { "a+",
     "1. byte [61-61] -> 2\n"
     "2. alt -> 1 | 3\n"
-    "3. match!\n" },
+    "3. match! 0\n" },
   { "a+?",
     "1. byte [61-61] -> 2\n"
     "2. alt -> 3 | 1\n"
-    "3. match!\n" },
+    "3. match! 0\n" },
   { "a*",
     "2. alt -> 1 | 3\n"
     "1. byte [61-61] -> 2\n"
-    "3. match!\n" },
+    "3. match! 0\n" },
   { "a*?",
     "2. alt -> 3 | 1\n"
-    "3. match!\n"
+    "3. match! 0\n"
     "1. byte [61-61] -> 2\n" },
   { "a?",
     "2. alt -> 1 | 3\n"
     "1. byte [61-61] -> 3\n"
-    "3. match!\n" },
+    "3. match! 0\n" },
   { "a??",
     "2. alt -> 3 | 1\n"
-    "3. match!\n"
+    "3. match! 0\n"
     "1. byte [61-61] -> 3\n" },
   { "a{4}",
     "1. byte [61-61] -> 2\n"
     "2. byte [61-61] -> 3\n"
     "3. byte [61-61] -> 4\n"
     "4. byte [61-61] -> 5\n"
-    "5. match!\n" },
+    "5. match! 0\n" },
   { "(a)",
     "2. capture 2 -> 1\n"
     "1. byte [61-61] -> 3\n"
     "3. capture 3 -> 4\n"
-    "4. match!\n" },
+    "4. match! 0\n" },
   { "(?:a)",
     "1. byte [61-61] -> 2\n"
-    "2. match!\n" },
+    "2. match! 0\n" },
   { "",
-    "2. match!\n" },
+    "2. match! 0\n" },
   { ".",
     "3. alt -> 1 | 2\n"
     "1. byte [00-09] -> 4\n"
     "2. byte [0b-ff] -> 4\n"
-    "4. match!\n" },
+    "4. match! 0\n" },
   { "[^ab]",
     "5. alt -> 3 | 4\n"
     "3. alt -> 1 | 2\n"
     "4. byte [63-ff] -> 6\n"
     "1. byte [00-09] -> 6\n"
     "2. byte [0b-60] -> 6\n"
-    "6. match!\n" },
+    "6. match! 0\n" },
   { "[Aa]",
     "1. byte/i [61-61] -> 2\n"
-    "2. match!\n" },
+    "2. match! 0\n" },
 };
 
 TEST(TestRegexpCompileToProg, Simple) {

diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 27c85f5..4bdf714 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc

@@ -49,12 +49,13 @@
   "bot",
   "eot",
   "cc",
+  "match",
 };
 
 // Create string representation of regexp with explicit structure.
 // Nothing pretty, just for testing.
 static void DumpRegexpAppending(Regexp* re, string* s) {
-  if (re->op() < 0 || re->op() > kMaxRegexpOp) {
+  if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
     StringAppendF(s, "op%d", re->op());
   } else {
     switch (re->op()) {
@@ -88,6 +89,11 @@
   switch (re->op()) {
     default:
       break;
+    case kRegexpEndText:
+      if (!(re->parse_flags() & Regexp::WasDollar)) {
+        s->append("\\z");
+      }
+      break;
     case kRegexpLiteral: {
       Rune r = re->rune();
       char buf[UTFmax+1];
@@ -124,12 +130,6 @@
       s->append(StringPrintf("%d,%d ", re->min(), re->max()));
       DumpRegexpAppending(re->sub()[0], s);
       break;
-    case kRegexpAnyChar:
-    case kRegexpBeginLine:
-    case kRegexpEndLine:
-    case kRegexpWordBoundary:
-    case kRegexpNoWordBoundary:
-      break;
     case kRegexpCharClass: {
       string sep;
       for (CharClass::iterator it = re->cc()->begin();

diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 7e0de62..106f35b 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc

@@ -41,7 +41,8 @@
   { "a{2,3}?", "nrep{2,3 lit{a}}" },
   { "a{2,}?", "nrep{2,-1 lit{a}}" },
   { "", "emp{}" },
-  { "|", "alt{emp{}emp{}}" },
+  { "|", "emp{}" },  // alt{emp{}emp{}} but got factored
+  { "|x|", "alt{emp{}lit{x}emp{}}" },
   { ".", "dot{}" },
   { "^", "bol{}" },
   { "$", "eol{}" },
@@ -111,9 +112,9 @@
   { "(?-m)^", "bot{}" },
   { "(?-m)$", "eot{}" },
   { "(?m)\\A", "bot{}" },
-  { "(?m)\\z", "eot{}" },
+  { "(?m)\\z", "eot{\\z}" },
   { "(?-m)\\A", "bot{}" },
-  { "(?-m)\\z", "eot{}" },
+  { "(?-m)\\z", "eot{\\z}" },
 
   // Test named captures
   { "(?P<name>a)", "cap{name:lit{a}}" },
@@ -128,16 +129,33 @@
 
 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | Regexp::PerlX | Regexp::PerlClasses;
 
-void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, const string& title) {
+bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
+  return Regexp::Equal(a, b);
+}
+
+void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
+               const string& title) {
+  Regexp** re = new Regexp*[ntests];
   for (int i = 0; i < ntests; i++) {
     RegexpStatus status;
-    Regexp* re = Regexp::Parse(tests[i].regexp, flags, &status);
-    CHECK(re != NULL) << " " << tests[i].regexp << " "
-                      << status.Text();
-    string s = re->Dump();
-    EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp;
-    re->Decref();
+    re[i] = Regexp::Parse(tests[i].regexp, flags, &status);
+    CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
+                         << status.Text();
+    string s = re[i]->Dump();
+    EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp;
   }
+
+  for (int i = 0; i < ntests; i++) {
+    for (int j = 0; j < ntests; j++) {
+      EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
+                RegexpEqualTestingOnly(re[i], re[j]))
+        << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
+    }
+  }
+
+  for (int i = 0; i < ntests; i++)
+    re[i]->Decref();
+  delete[] re;
 }
 
 // Test that regexps parse to expected structures.
@@ -204,6 +222,13 @@
       "cat{str{bc}cc{0x78-0x79}}}" },
   { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
   { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
+  { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
+  { "(?:xx|yy)c|(?:xx|yy)d",
+    "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
+  { "x{2}|x{2}[0-9]",
+    "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
+  { "x{2}y|x{2}[0-9]y",
+    "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
 };
 
 // Test that prefix factoring works.

diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 4c569b0..ed79663 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc

@@ -41,6 +41,37 @@
   CHECK_EQ(v[0], 1);
 }
 
+TEST(Set, UnanchoredFactored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+  CHECK_EQ(s.Add("foo", NULL), 0);
+  CHECK_EQ(s.Add("(", NULL), -1);
+  CHECK_EQ(s.Add("foobar", NULL), 1);
+
+  CHECK_EQ(s.Compile(), true);
+
+  vector<int> v;
+  CHECK_EQ(s.Match("foobar", &v), true);
+  CHECK_EQ(v.size(), 2);
+  CHECK_EQ(v[0], 0);
+  CHECK_EQ(v[1], 1);
+
+  v.clear();
+  CHECK_EQ(s.Match("obarfoobaroo", &v), true);
+  CHECK_EQ(v.size(), 2);
+  CHECK_EQ(v[0], 0);
+  CHECK_EQ(v[1], 1);
+
+  v.clear();
+  CHECK_EQ(s.Match("fooba", &v), true);
+  CHECK_EQ(v.size(), 1);
+  CHECK_EQ(v[0], 0);
+
+  v.clear();
+  CHECK_EQ(s.Match("oobar", &v), false);
+  CHECK_EQ(v.size(), 0);
+}
+
 TEST(Set, Anchored) {
   RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
 

diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index 6f6cfc5..ed75309 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc

@@ -123,7 +123,7 @@
   // explicit (?:) in place of non-parenthesized empty strings,
   // to make them easier to spot for other parsers.
   { "(a|b|)", "([a-b]|(?:))" },
-  { "(|)", "((?:)|(?:))" },
+  { "(|)", "()" },
   { "a()", "a()" },
   { "(()|())", "(()|())" },
   { "(a|)", "(a|(?:))" },

diff --git a/re2/tostring.cc b/re2/tostring.cc
index a4aabbe..555524f 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc

@@ -75,6 +75,7 @@
     case kRegexpWordBoundary:
     case kRegexpNoWordBoundary:
     case kRegexpCharClass:
+    case kRegexpHaveMatch:
       nprec = PrecAtom;
       break;
 
@@ -237,7 +238,10 @@
       break;
 
     case kRegexpEndText:
-      t_->append("(?-m:$)");
+      if (re->parse_flags() & Regexp::WasDollar)
+        t_->append("(?-m:$)");
+      else
+        t_->append("\\z");
       break;
 
     case kRegexpWordBoundary:
@@ -272,6 +276,13 @@
     case kRegexpCapture:
       t_->append(")");
       break;
+
+    case kRegexpHaveMatch:
+      // There's no syntax accepted by the parser to generate
+      // this node (it is generated by RE2::Set) so make something
+      // up that is readable but won't compile.
+      t_->append("(?HaveMatch:%d)", re->match_id());
+      break;
   }
 
   // If the parent is an alternation, append the | for it.
commit	4a9f4ca78a66843709737519472becb28c20e20c	[log] [tgz]
author	Russ Cox <[email protected]>	Thu Jul 15 20:38:05 2010 -0700
committer	Russ Cox <[email protected]>	Thu Jul 15 20:38:05 2010 -0700
tree	3400cc04094db3b906d65f4d7e1c89e7148c1420
parent	4c213792cf4aa033242d3eb01841c4ba0bd805d7 [diff]