factor common prefixes from alternations

Juggle code so that RE2::Set can benefit too.

R=rsc
CC=re2-dev
http://codereview.appspot.com/1740045
diff --git a/re2/compile.cc b/re2/compile.cc
index bbc4607..7b54c2d 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -93,7 +93,7 @@
   }
 
   Prog::Inst* ip = &inst0[l.p>>1];
-  if(l.p&1)
+  if (l.p&1)
     ip->out1_ = l2.p;
   else
     ip->set_out(l2.p);
@@ -132,7 +132,7 @@
   // Compiles alternation of all the re to a new Prog.
   // Each re has a match with an id equal to its index in the vector.
   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                          const vector<Regexp*>& re);
+                          Regexp* re);
 
   // Interface for Regexp::Walker, which helps traverse the Regexp.
   // The walk is purely post-recursive: given the machines for the
@@ -207,7 +207,7 @@
   // Single rune.
   Frag Literal(Rune r, bool foldcase);
 
-  void Setup(Regexp::ParseFlags, int64);
+  void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
   Prog* Finish();
 
   // Returns .* where dot = any byte
@@ -230,6 +230,8 @@
   map<uint64, int> rune_cache_;
   Frag rune_range_;
 
+  RE2::Anchor anchor_;  // anchor mode for RE2::Set
+
   DISALLOW_EVIL_CONSTRUCTORS(Compiler);
 };
 
@@ -478,7 +480,8 @@
   rune_range_.end = nullPatchList;
 }
 
-int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
+int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
+                                     int next) {
   Frag f = ByteRange(lo, hi, foldcase);
   if (next != 0) {
     PatchList::Patch(inst_, f.end, next);
@@ -492,7 +495,9 @@
   // In Latin1 mode, there's no point in caching.
   // In forward UTF-8 mode, only need to cache continuation bytes.
   if (encoding_ == kEncodingLatin1 ||
-      (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf))) {
+      (encoding_ == kEncodingUTF8 &&
+       !reversed_ &&
+       !(0x80 <= lo && hi <= 0xbf))) {
     return UncachedRuneByteSuffix(lo, hi, foldcase, next);
   }
 
@@ -723,6 +728,14 @@
     case kRegexpEmptyMatch:
       return Nop();
 
+    case kRegexpHaveMatch: {
+      Frag f = Match(re->match_id());
+      // Remember unanchored match to end of string.
+      if (anchor_ != RE2::ANCHOR_BOTH)
+        f = Cat(DotStar(), f);
+      return f;
+    }
+
     case kRegexpConcat: {
       Frag f = child_frags[0];
       for (int i = 1; i < nchild_frags; i++)
@@ -896,7 +909,8 @@
   }
 }
 
-void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) {
+void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
+                     RE2::Anchor anchor) {
   prog_->set_flags(flags);
 
   if (flags & Regexp::Latin1)
@@ -926,6 +940,8 @@
 
     max_inst_ = m;
   }
+
+  anchor_ = anchor;
 }
 
 // Compiles re, returning program.
@@ -936,7 +952,7 @@
 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
   Compiler c;
 
-  c.Setup(re->parse_flags(), max_mem);
+  c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
   c.reversed_ = reversed;
 
   // Simplify to remove things like counted repetitions
@@ -1035,30 +1051,22 @@
 
 // Compiles RE set to Prog.
 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                           const vector<Regexp*>& re) {
+                           Regexp* re) {
   Compiler c;
 
-  c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()),
-          options.max_mem());
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
+  c.Setup(pf, options.max_mem(), anchor);
 
-  // Accumulate alternation of fragments.
-  Frag all = kNullFrag;
-  for (int i = 0; i < re.size(); i++) {
-    Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_);
-    if (c.failed_)
-      return NULL;
-    switch (anchor) {
-      case RE2::UNANCHORED:
-        f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar());
-        break;
-      case RE2::ANCHOR_START:
-        f = c.Cat(f, c.DotStar());
-        break;
-      case RE2::ANCHOR_BOTH:
-        break;
-    }
-    f = c.Cat(f, c.Match(i));
-    all = c.Alt(all, f);
+  // Compile alternation of fragments.
+  Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
+  re->Decref();
+  if (c.failed_)
+    return NULL;
+
+  if (anchor == RE2::UNANCHORED) {
+    // The trailing .* was added while handling kRegexpHaveMatch.
+    // We just have to add the leading one.
+    all = c.Cat(c.DotStar(), all);
   }
 
   c.prog_->set_start(all.begin);
@@ -1066,14 +1074,6 @@
   c.prog_->set_anchor_start(true);
   c.prog_->set_anchor_end(true);
 
-  // Also create unanchored version, which starts with a .*? loop.
-  if (c.prog_->anchor_start()) {
-    c.prog_->set_start_unanchored(c.prog_->start());
-  } else {
-    Frag unanchored = c.Cat(c.DotStar(), all);
-    c.prog_->set_start_unanchored(unanchored.begin);
-  }
-
   Prog* prog = c.Finish();
   if (prog == NULL)
     return NULL;
@@ -1093,7 +1093,7 @@
 }
 
 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
-                       const vector<Regexp*>& re) {
+                       Regexp* re) {
   return Compiler::CompileSet(options, anchor, re);
 }