re2: add CapturingGroupNames
Export from Google source tree.
R=rsc
CC=re2-dev
http://codereview.appspot.com/3981051
diff --git a/re2/re2.cc b/re2/re2.cc
index ee825d0..e4fd575 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -154,6 +154,7 @@
prog_ = NULL;
rprog_ = NULL;
named_groups_ = NULL;
+ group_names_ = NULL;
num_captures_ = -1;
RegexpStatus status;
@@ -217,7 +218,8 @@
return rprog_;
}
-static const map<string, int> empty_map;
+static const map<string, int> empty_named_groups;
+static const map<int, string> empty_group_names;
RE2::~RE2() {
if (suffix_regexp_)
@@ -229,8 +231,10 @@
delete rprog_;
if (error_ != &empty_string)
delete error_;
- if (named_groups_ != NULL && named_groups_ != &empty_map)
+ if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
delete named_groups_;
+ if (group_names_ != NULL && group_names_ != &empty_group_names)
+ delete group_names_;
}
int RE2::ProgramSize() const {
@@ -243,15 +247,28 @@
const map<string, int>& RE2::NamedCapturingGroups() const {
MutexLock l(mutex_);
if (!ok())
- return empty_map;
+ return empty_named_groups;
if (named_groups_ == NULL) {
named_groups_ = suffix_regexp_->NamedCaptures();
if (named_groups_ == NULL)
- named_groups_ = &empty_map;
+ named_groups_ = &empty_named_groups;
}
return *named_groups_;
}
+// Returns group_names_, computing it if needed.
+const map<int, string>& RE2::CapturingGroupNames() const {
+ MutexLock l(mutex_);
+ if (!ok())
+ return empty_group_names;
+ if (group_names_ == NULL) {
+ group_names_ = suffix_regexp_->CaptureNames();
+ if (group_names_ == NULL)
+ group_names_ = &empty_group_names;
+ }
+ return *group_names_;
+}
+
/***** Convenience interfaces *****/
bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
diff --git a/re2/re2.h b/re2/re2.h
index f6d1e80..0c73e56 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -415,10 +415,18 @@
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
+
// Return a map from names to capturing indices.
+ // The map records the index of the leftmost group
+ // with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
+ // Return a map from capturing indices to names.
+ // The map has no entries for unnamed groups.
+ // Only valid until the re is deleted.
+ const map<int, string>& CapturingGroupNames() const;
+
// General matching routine.
// Match against text starting at offset startpos.
// Returns true if match found, false if not.
@@ -688,6 +696,9 @@
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
+ // Map from capture indices to names
+ mutable const map<int, string>* group_names_;
+
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
RE2(const RE2&);
void operator=(const RE2&);
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 6ecf834..d8d3cda 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -559,6 +559,46 @@
return w.TakeMap();
}
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+ CaptureNamesWalker() : map_(NULL) {}
+ ~CaptureNamesWalker() { delete map_; }
+
+ map<int, string>* TakeMap() {
+ map<int, string>* m = map_;
+ map_ = NULL;
+ return m;
+ }
+
+ Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
+ // Allocate map once we find a name.
+ if (map_ == NULL)
+ map_ = new map<int, string>;
+
+ (*map_)[re->cap()] = *re->name();
+ }
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+ return ignored;
+ }
+
+ private:
+ map<int, string>* map_;
+ DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+
+map<int, string>* Regexp::CaptureNames() {
+ CaptureNamesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();
+}
+
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
diff --git a/re2/regexp.h b/re2/regexp.h
index d844c0c..1aebc16 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -370,6 +370,11 @@
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
+ // Returns a map from capturing group indices to capturing group
+ // names or NULL if the regexp contains no named capture groups. The
+ // caller is responsible for deleting the map.
+ map<int, string>* CaptureNames();
+
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index fe2a9a3..0d33437 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1270,4 +1270,17 @@
EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s
}
+TEST(RE2, CapturingGroupNames) {
+ // Opening parentheses annotated with group IDs:
+ // 12 3 45 6 7
+ RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
+ EXPECT_TRUE(re.ok());
+ const map<int, string>& have = re.CapturingGroupNames();
+ map<int, string> want;
+ want[3] = "G2";
+ want[6] = "G2";
+ want[7] = "G1";
+ EXPECT_EQ(want, have);
+}
+
} // namespace re2
diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc
index 6685c37..f317cbc 100644
--- a/re2/testing/regexp_test.cc
+++ b/re2/testing/regexp_test.cc
@@ -14,7 +14,6 @@
// Test that overflowed ref counts work.
TEST(Regexp, BigRef) {
Regexp* re;
-
re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
for (int i = 0; i < 100000; i++)
re->Incref();
@@ -40,4 +39,43 @@
x->Decref();
}
+TEST(Regexp, NamedCaptures) {
+ Regexp* x;
+ RegexpStatus status;
+ x = Regexp::Parse(
+ "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(4, x->NumCaptures());
+ const map<string, int>* have = x->NamedCaptures();
+ EXPECT_TRUE(have != NULL);
+ EXPECT_EQ(2, have->size()); // there are only two named groups in
+ // the regexp: 'g1' and 'g2'.
+ map<string, int> want;
+ want["g1"] = 1;
+ want["g2"] = 3;
+ EXPECT_EQ(want, *have);
+ x->Decref();
+ delete have;
+}
+
+TEST(Regexp, CaptureNames) {
+ Regexp* x;
+ RegexpStatus status;
+ x = Regexp::Parse(
+ "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(4, x->NumCaptures());
+ const map<int, string>* have = x->CaptureNames();
+ EXPECT_TRUE(have != NULL);
+ EXPECT_EQ(3, have->size());
+ map<int, string> want;
+ want[1] = "g1";
+ want[3] = "g2";
+ want[4] = "g1";
+
+ EXPECT_EQ(want, *have);
+ x->Decref();
+ delete have;
+}
+
} // namespace re2