factor common prefix strings out of alternation

R=rsc
CC=re2-dev
http://codereview.appspot.com/1665049
diff --git a/re2/parse.cc b/re2/parse.cc
index 0169fa9..36b1c8f 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -102,7 +102,7 @@
 
   // Processes the end of input, returning the final regexp.
   Regexp* DoFinish();
-  
+
   // Finishes the regexp if necessary, preparing it for use
   // in a more complicated expression.
   // If it is a CharClassBuilder, converts into a CharClass.
@@ -133,7 +133,20 @@
   // Parse a Perl flag set or non-capturing group from s.
   bool ParsePerlFlags(StringPiece* s);
 
- private:
+
+  // Returns the leading string that re starts with.
+  // The returned Rune* points into a piece of re,
+  // so it must not be used after the caller calls re->Decref().
+  Rune* LeadingString(Regexp* re, int *nrune, Regexp::ParseFlags *flags);
+
+  // Removes the first n leading runes from the beginning of re.
+  void RemoveLeadingString(Regexp* re, int n);
+
+  // Simplifies an alternation of literal strings by factoring out
+  // common prefixes.
+  int SimplifyAlternation(Regexp** sub, int n, Regexp::ParseFlags altflags,
+                          int maxdepth);
+
   // Finishes the current concatenation,
   // collapsing it into a single regexp on the stack.
   void DoConcatenation();
@@ -148,6 +161,7 @@
   // Maybe concatenate Literals into LiteralString.
   bool MaybeConcatString(int r, ParseFlags flags);
 
+private:
   ParseFlags flags_;
   StringPiece whole_regexp_;
   RegexpStatus* status_;
@@ -162,6 +176,11 @@
 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
 
+// Add a range to the character class, but exclude newline if asked.
+// Also handle case folding.
+static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
+                     Regexp::ParseFlags parse_flags);
+
 Regexp::ParseState::ParseState(ParseFlags flags,
                                const StringPiece& whole_regexp,
                                RegexpStatus* status)
@@ -550,7 +569,8 @@
           // fall through
         case kRegexpCharClass:
           if (r1->op() == kRegexpLiteral)
-            AddLiteral(r3->ccb_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase);
+            AddLiteral(r3->ccb_, r1->rune_,
+                       r1->parse_flags_ & Regexp::FoldCase);
           else if (r1->op() == kRegexpCharClass)
             r3->ccb_->AddCharClass(r1->ccb_);
           if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
@@ -630,6 +650,205 @@
   return FinishRegexp(re);
 }
 
+// Returns the leading string that re starts with.
+// The returned Rune* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Rune* Regexp::ParseState::LeadingString(Regexp* re, int *nrune,
+                                        Regexp::ParseFlags *flags) {
+  while (re->op() == kRegexpConcat && re->nsub() > 0)
+    re = re->sub()[0];
+
+  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
+
+  if (re->op() == kRegexpLiteral) {
+    *nrune = 1;
+    return &re->rune_;
+  }
+
+  if (re->op() == kRegexpLiteralString) {
+    *nrune = re->nrunes_;
+    return re->runes_;
+  }
+
+  *nrune = 0;
+  return NULL;
+}
+
+// Removes the first n leading runes from the beginning of re.
+// Edits re in place.
+void Regexp::ParseState::RemoveLeadingString(Regexp* re, int n) {
+  while (re->op() == kRegexpConcat && re->nsub() > 0)
+    re = re->sub()[0];
+
+  if (re->op() == kRegexpLiteral) {
+    re->op_ = kRegexpEmptyMatch;
+    return;
+  }
+
+  if (re->op() == kRegexpLiteralString) {
+    if (n >= re->nrunes_) {
+      delete[] re->runes_;
+      re->runes_ = NULL;
+      re->nrunes_ = 0;
+      re->op_ = kRegexpEmptyMatch;
+    } else if (n == re->nrunes_ - 1) {
+      Rune rune = re->runes_[re->nrunes_ - 1];
+      delete[] re->runes_;
+      re->runes_ = NULL;
+      re->nrunes_ = 0;
+      re->rune_ = rune;
+      re->op_ = kRegexpLiteral;
+
+    } else {
+      re->nrunes_ -= n;
+      memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
+    }
+    return;
+  }
+}
+
+// Simplifies an alternation of literal strings by factoring out
+// common prefixes.  For example,
+//     ABC|ABD|AEF|BCX|BCY
+// simplifies to
+//     A(B(C|D)|EF)|BC(X|Y)
+// which the normal parse state routines will further simplify to
+//     A(B[CD]|EF)|BC[XY]
+//
+// Rewrites sub to contain simplified list to alternate and returns
+// the new length of sub.  Adjusts reference counts accordingly
+// (incoming sub[i] decremented, outgoing sub[i] incremented).
+
+// It's too much of a pain to explicitly write out the recursion,
+// so instead we let the caller specify a maximum depth and
+// don't simplify beyond that.  There are around 15 words of local
+// variables and parameters in the frame, so allowing 8 levels
+// on a 64-bit machine is still less than a kilobyte of stack and
+// probably enough benefit for practical uses.
+const int kSimplifyAlternationMaxDepth = 8;
+
+int Regexp::ParseState::SimplifyAlternation(
+    Regexp** sub, int n,
+    Regexp::ParseFlags altflags,
+    int maxdepth) {
+
+  if (maxdepth <= 0)
+    return n;
+
+  // Round 1: Factor out common prefixes.
+  Rune *rune = NULL;
+  int nrune = 0;
+  Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
+  int start = 0;
+  int out = 0;
+  for (int i = 0; i <= n; i++) {
+    // Invariant: what was in sub[0:start] has been Decref'ed
+    // and that space has been reused for sub[0:out] (out <= start).
+    //
+    // Invariant: sub[start:i] consists of regexps that all begin
+    // with the string rune[0:nrune].
+
+    Rune* rune_i = NULL;
+    int nrune_i = 0;
+    Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
+    if (i < n) {
+      rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i);
+      if (runeflags_i == runeflags) {
+        int same = 0;
+        while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
+          same++;
+        if (same > 0) {
+          // Matches at least one rune in current range.  Keep going around.
+          nrune = same;
+          continue;
+        }
+      }
+    }
+
+    // Found end of a run with common leading literal string:
+    // sub[start:i] all begin with rune[0:nrune] but sub[i]
+    // does not even begin with rune[0].
+    //
+    // Factor out common string and append factored expression to sub[0:out].
+    if (i == start) {
+      // Nothing to do - first iteration.
+    } else if (i == start+1) {
+      // Just one: don't bother factoring.
+      sub[out++] = sub[start];
+    } else {
+      // Construct factored form: prefix(suffix1|suffix2|...)
+      Regexp* x[2];  // x[0] = prefix, x[1] = suffix1|suffix2|...
+      x[0] = LiteralString(rune, nrune, runeflags);
+      for (int j = start; j < i; j++)
+        RemoveLeadingString(sub[j], nrune);
+      int nn = SimplifyAlternation(sub + start, i - start, altflags,
+                                   maxdepth - 1);
+      x[1] = Alternate(sub + start, nn, altflags);
+      sub[out++] = Concat(x, 2, altflags);
+    }
+
+    // Prepare for next round (if there is one).
+    if (i < n) {
+      start = i;
+      rune = rune_i;
+      nrune = nrune_i;
+      runeflags = runeflags_i;
+    }
+  }
+  n = out;
+
+  // Round 2: Collapse runs of single literals into character classes.
+  start = 0;
+  out = 0;
+  for (int i = 0; i <= n; i++) {
+    // Invariant: what was in sub[0:start] has been Decref'ed
+    // and that space has been reused for sub[0:out] (out <= start).
+    //
+    // Invariant: sub[start:i] consists of regexps that are either
+    // literal runes or character classes.
+
+    if (i < n &&
+        (sub[i]->op() == kRegexpLiteral ||
+         sub[i]->op() == kRegexpCharClass))
+      continue;
+
+    // sub[i] is not a char or char class;
+    // emit char class for sub[start:i]...
+    if (i == start) {
+      // Nothing to do.
+    } else if (i == start+1) {
+      // Char class (or not) of one.
+      sub[out++] = sub[start];
+    } else {
+      // Make new char class.
+      CharClassBuilder ccb;
+      for (int j = start; j < i; j++) {
+        Regexp* re = sub[j];
+        if (re->op() == kRegexpCharClass) {
+          CharClass* cc = re->cc();
+          for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+            ccb.AddRange(it->lo, it->hi);
+        } else if (re->op() == kRegexpLiteral) {
+          AddRange(&ccb, re->rune(), re->rune(), re->parse_flags());
+        } else {
+          LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
+                      << re->ToString();
+        }
+        re->Decref();
+      }
+      sub[out++] = NewCharClass(ccb.GetCharClass(), altflags);
+    }
+
+    // ... and then emit sub[i].
+    if (i < n)
+      sub[out++] = sub[i];
+    start = i+1;
+  }
+  n = out;
+
+  return n;
+}
+
 // Collapse the regexps on top of the stack, down to the
 // first marker, into a new op node (op == kRegexpAlternate
 // or op == kRegexpConcat).
@@ -666,7 +885,10 @@
       subs[--i] = FinishRegexp(sub);
     }
   }
-  
+
+  if (op == kRegexpAlternate)
+    n = SimplifyAlternation(subs, n, flags_, kSimplifyAlternationMaxDepth);
+
   Regexp* re = ConcatOrAlternate(op, subs, n, flags_);
   delete[] subs;
   re->simple_ = re->ComputeSimple();
@@ -698,8 +920,8 @@
 }
 
 // Incremental conversion of concatenated literals into strings.
-// If top of stack is (literal, literal) or (literal, string) or (string, literal)
-// or (string, string), collapse into single string.
+// If top two elements on stack are both literal or string,
+// collapse into single string.
 // Don't walk down the stack -- the parser calls this frequently
 // enough that below the bottom two is known to be collapsed.
 // Only called when another regexp is about to be pushed
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 7b21fbe..e988f8a 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -65,7 +65,7 @@
 int Regexp::Ref() {
   if (ref_ < kMaxRef)
     return ref_;
-  
+
   MutexLock l(&ref_mutex);
   return ref_map[this];
 }
@@ -189,6 +189,9 @@
 }
 
 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, ParseFlags flags) {
+  if (nsub == 1)
+    return sub[0];
+
   if (nsub > kMaxNsub) {
     // Too many subexpressions to fit in a single Regexp.
     // Make a two-level tree.  Two levels gets us to 65535^2.
@@ -574,7 +577,7 @@
   }
 
   for (;;) {
-    
+
     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
     if (it == end())
       break;
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index a78395a..7e0de62 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -191,12 +191,26 @@
   { "[a\\n]", "cc{0xa 0x61}" },
 };
 
-
 // Test that parsing without MatchNL works.
 TEST(TestParse, NoMatchNL) {
   TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
 }
 
+Test prefix_tests[] = {
+  { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
+  { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
+  { "abc|abd|aef|bcx|bcy",
+    "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
+      "cat{str{bc}cc{0x78-0x79}}}" },
+  { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
+  { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
+};
+
+// Test that prefix factoring works.
+TEST(TestParse, Prefix) {
+  TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
+}
+
 // Invalid regular expressions
 const char* badtests[] = {
   "(",