re2: add CapturingGroupNames

Export from Google source tree.

R=rsc
CC=re2-dev
http://codereview.appspot.com/3981051
diff --git a/re2/re2.cc b/re2/re2.cc
index ee825d0..e4fd575 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -154,6 +154,7 @@
   prog_ = NULL;
   rprog_ = NULL;
   named_groups_ = NULL;
+  group_names_ = NULL;
   num_captures_ = -1;
 
   RegexpStatus status;
@@ -217,7 +218,8 @@
   return rprog_;
 }
 
-static const map<string, int> empty_map;
+static const map<string, int> empty_named_groups;
+static const map<int, string> empty_group_names;
 
 RE2::~RE2() {
   if (suffix_regexp_)
@@ -229,8 +231,10 @@
   delete rprog_;
   if (error_ != &empty_string)
     delete error_;
-  if (named_groups_ != NULL && named_groups_ != &empty_map)
+  if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
     delete named_groups_;
+  if (group_names_ != NULL &&  group_names_ != &empty_group_names)
+    delete group_names_;
 }
 
 int RE2::ProgramSize() const {
@@ -243,15 +247,28 @@
 const map<string, int>&  RE2::NamedCapturingGroups() const {
   MutexLock l(mutex_);
   if (!ok())
-    return empty_map;
+    return empty_named_groups;
   if (named_groups_ == NULL) {
     named_groups_ = suffix_regexp_->NamedCaptures();
     if (named_groups_ == NULL)
-      named_groups_ = &empty_map;
+      named_groups_ = &empty_named_groups;
   }
   return *named_groups_;
 }
 
+// Returns group_names_, computing it if needed.
+const map<int, string>&  RE2::CapturingGroupNames() const {
+  MutexLock l(mutex_);
+  if (!ok())
+    return empty_group_names;
+  if (group_names_ == NULL) {
+    group_names_ = suffix_regexp_->CaptureNames();
+    if (group_names_ == NULL)
+      group_names_ = &empty_group_names;
+  }
+  return *group_names_;
+}
+
 /***** Convenience interfaces *****/
 
 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
diff --git a/re2/re2.h b/re2/re2.h
index f6d1e80..0c73e56 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -415,10 +415,18 @@
   // does not count: if the regexp is "(a)(b)", returns 2.
   int NumberOfCapturingGroups() const;
 
+
   // Return a map from names to capturing indices.
+  // The map records the index of the leftmost group
+  // with the given name.
   // Only valid until the re is deleted.
   const map<string, int>& NamedCapturingGroups() const;
 
+  // Return a map from capturing indices to names.
+  // The map has no entries for unnamed groups.
+  // Only valid until the re is deleted.
+  const map<int, string>& CapturingGroupNames() const;
+
   // General matching routine.
   // Match against text starting at offset startpos.
   // Returns true if match found, false if not.
@@ -688,6 +696,9 @@
   // Map from capture names to indices
   mutable const map<string, int>* named_groups_;
 
+  // Map from capture indices to names
+  mutable const map<int, string>* group_names_;
+
   //DISALLOW_EVIL_CONSTRUCTORS(RE2);
   RE2(const RE2&);
   void operator=(const RE2&);
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 6ecf834..d8d3cda 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -559,6 +559,46 @@
   return w.TakeMap();
 }
 
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+  CaptureNamesWalker() : map_(NULL) {}
+  ~CaptureNamesWalker() { delete map_; }
+
+  map<int, string>* TakeMap() {
+    map<int, string>* m = map_;
+    map_ = NULL;
+    return m;
+  }
+
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() == kRegexpCapture && re->name() != NULL) {
+      // Allocate map once we find a name.
+      if (map_ == NULL)
+        map_ = new map<int, string>;
+
+      (*map_)[re->cap()] = *re->name();
+    }
+    return ignored;
+  }
+
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    // Should never be called: we use Walk not WalkExponential.
+    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+    return ignored;
+  }
+
+ private:
+  map<int, string>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+
+map<int, string>* Regexp::CaptureNames() {
+  CaptureNamesWalker w;
+  w.Walk(this, 0);
+  return w.TakeMap();
+}
+
 // Determines whether regexp matches must be anchored
 // with a fixed string prefix.  If so, returns the prefix and
 // the regexp that remains after the prefix.  The prefix might
diff --git a/re2/regexp.h b/re2/regexp.h
index d844c0c..1aebc16 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -370,6 +370,11 @@
   // The caller is responsible for deleting the map.
   map<string, int>* NamedCaptures();
 
+  // Returns a map from capturing group indices to capturing group
+  // names or NULL if the regexp contains no named capture groups. The
+  // caller is responsible for deleting the map.
+  map<int, string>* CaptureNames();
+
   // Returns a string representation of the current regexp,
   // using as few parentheses as possible.
   string ToString();
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index fe2a9a3..0d33437 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1270,4 +1270,17 @@
   EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
 }
 
+TEST(RE2, CapturingGroupNames) {
+  // Opening parentheses annotated with group IDs:
+  //      12    3        45   6         7
+  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
+  EXPECT_TRUE(re.ok());
+  const map<int, string>& have = re.CapturingGroupNames();
+  map<int, string> want;
+  want[3] = "G2";
+  want[6] = "G2";
+  want[7] = "G1";
+  EXPECT_EQ(want, have);
+}
+
 }  // namespace re2
diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc
index 6685c37..f317cbc 100644
--- a/re2/testing/regexp_test.cc
+++ b/re2/testing/regexp_test.cc
@@ -14,7 +14,6 @@
 // Test that overflowed ref counts work.
 TEST(Regexp, BigRef) {
   Regexp* re;
-  
   re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
   for (int i = 0; i < 100000; i++)
     re->Incref();
@@ -40,4 +39,43 @@
   x->Decref();
 }
 
+TEST(Regexp, NamedCaptures) {
+  Regexp* x;
+  RegexpStatus status;
+  x = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(4, x->NumCaptures());
+  const map<string, int>* have = x->NamedCaptures();
+  EXPECT_TRUE(have != NULL);
+  EXPECT_EQ(2, have->size());  // there are only two named groups in
+                               // the regexp: 'g1' and 'g2'.
+  map<string, int> want;
+  want["g1"] = 1;
+  want["g2"] = 3;
+  EXPECT_EQ(want, *have);
+  x->Decref();
+  delete have;
+}
+
+TEST(Regexp, CaptureNames) {
+  Regexp* x;
+  RegexpStatus status;
+  x = Regexp::Parse(
+      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(4, x->NumCaptures());
+  const map<int, string>* have = x->CaptureNames();
+  EXPECT_TRUE(have != NULL);
+  EXPECT_EQ(3, have->size());
+  map<int, string> want;
+  want[1] = "g1";
+  want[3] = "g2";
+  want[4] = "g1";
+
+  EXPECT_EQ(want, *have);
+  x->Decref();
+  delete have;
+}
+
 }  // namespace re2