re2: memory diet
Cut unnecessary fields from Regexp and order to align better.
Combine smallsub and sub into one field.
Changes sizeof(Regexp) - a single node - from 104 to 40 bytes.
Use sorted array as permanent representation
of character class, avoiding the bloat of STL set<>.
Combined with CL 1129043, cuts storage used
by a typical RE2 object by about 60%.
R=r
CC=re2-dev
http://codereview.appspot.com/1174041
diff --git a/re2/parse.cc b/re2/parse.cc
index 8471b3f..bbb8eb0 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -102,6 +102,11 @@
// Processes the end of input, returning the final regexp.
Regexp* DoFinish();
+
+ // Finishes the regexp if necessary, preparing it for use
+ // in a more complicated expression.
+ // If it is a CharClassBuilder, converts into a CharClass.
+ Regexp* FinishRegexp(Regexp*);
// These routines don't manipulate the parse stack
// directly, but they do need to look at flags_.
@@ -174,6 +179,24 @@
}
}
+// Finishes the regexp if necessary, preparing it for use in
+// a more complex expression.
+// If it is a CharClassBuilder, converts into a CharClass.
+Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
+ if (re == NULL)
+ return NULL;
+ re->down_ = NULL;
+
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
+ CharClassBuilder* ccb = re->ccb_;
+ re->ccb_ = NULL;
+ re->cc_ = ccb->GetCharClass();
+ delete ccb;
+ }
+
+ return re;
+}
+
// Pushes the given regular expression onto the stack.
// Could check for too much memory used here.
bool Regexp::ParseState::PushRegexp(Regexp* re) {
@@ -185,14 +208,14 @@
// analysis does better with fewer character classes.
// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
if (re->op_ == kRegexpCharClass) {
- if (re->cc_->size() == 1) {
- Rune r = re->cc_->begin()->lo;
+ if (re->ccb_->size() == 1) {
+ Rune r = re->ccb_->begin()->lo;
re->Decref();
re = new Regexp(kRegexpLiteral, flags_);
re->rune_ = r;
- } else if (re->cc_->size() == 2) {
- Rune r = re->cc_->begin()->lo;
- if ('A' <= r && r <= 'Z' && re->cc_->Contains(r + 'a' - 'A')) {
+ } else if (re->ccb_->size() == 2) {
+ Rune r = re->ccb_->begin()->lo;
+ if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
re->Decref();
re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
re->rune_ = r + 'a' - 'A';
@@ -279,7 +302,7 @@
// Add lo-hi to the class, along with their fold-equivalent characters.
// If lo-hi is already in the class, assume that the fold-equivalent
// chars are there too, so there's no work to do.
-static void AddFoldedRange(CharClass* cc, Rune lo, Rune hi, int depth) {
+static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
// AddFoldedRange calls itself recursively for each rune in the fold cycle.
// Most folding cycles are small: there aren't any bigger than four in the
// current Unicode tables. make_unicode_casefold.py checks that
@@ -335,14 +358,15 @@
// Do case folding if needed.
if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
Rune r1 = r;
do {
if (!(flags_ & NeverNL) || r != '\n') {
- re->cc()->AddRange(r, r);
+ re->ccb_->AddRange(r, r);
}
r = CycleFoldRune(r);
} while (r != r1);
- re->cc()->RemoveAbove(rune_max_);
+ re->ccb_->RemoveAbove(rune_max_);
return PushRegexp(re);
}
@@ -394,8 +418,9 @@
return PushSimpleOp(kRegexpAnyChar);
// Rewrite . into [^\n]
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->cc_->AddRange(0, '\n' - 1);
- re->cc_->AddRange('\n' + 1, rune_max_);
+ re->ccb_ = new CharClassBuilder;
+ re->ccb_->AddRange(0, '\n' - 1);
+ re->ccb_->AddRange('\n' + 1, rune_max_);
return PushRegexp(re);
}
@@ -420,9 +445,9 @@
fl = fl ^ NonGreedy;
Regexp* re = new Regexp(op, fl);
re->AllocSub(1);
- re->sub_[0] = stacktop_;
- re->simple_ = re->ComputeSimple();
re->down_ = stacktop_->down_;
+ re->sub()[0] = FinishRegexp(stacktop_);
+ re->simple_ = re->ComputeSimple();
stacktop_ = re;
return true;
}
@@ -449,16 +474,17 @@
re->min_ = min;
re->max_ = max;
re->AllocSub(1);
- re->sub_[0] = stacktop_;
+ re->down_ = stacktop_->down_;
+ re->sub()[0] = FinishRegexp(stacktop_);
re->simple_ = re->ComputeSimple();
- re->down_ = stacktop_->down_;
stacktop_ = re;
return true;
}
-// Pseudo-operators marking left paren and pipe operators
-// in the regexp stack.
+// Pseudo-operators - only on parse stack.
+
+// Markers for left paren and pipe operators in the regexp stack.
const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
@@ -485,7 +511,7 @@
}
// Adds r to cc, along with r's upper case if foldascii is set.
-static void AddLiteral(CharClass* cc, Rune r, bool foldascii) {
+static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
cc->AddRange(r, r);
if (foldascii && 'a' <= r && r <= 'z')
cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
@@ -513,21 +539,24 @@
r1->op() == kRegexpCharClass ||
r1->op() == kRegexpAnyChar) &&
(r3 = r2->down_) != NULL) {
+ Rune rune;
switch (r3->op()) {
case kRegexpLiteral: // convert to char class
+ rune = r3->rune_;
r3->op_ = kRegexpCharClass;
- r3->cc_ = new CharClass;
- AddLiteral(r3->cc_, r3->rune_, r3->parse_flags_ & Regexp::FoldCase);
+ r3->cc_ = NULL;
+ r3->ccb_ = new CharClassBuilder;
+ AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
// fall through
case kRegexpCharClass:
if (r1->op() == kRegexpLiteral)
- AddLiteral(r3->cc_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase);
+ AddLiteral(r3->ccb_, r1->rune_, r1->parse_flags_ & Regexp::FoldCase);
else if (r1->op() == kRegexpCharClass)
- r3->cc_->AddCharClass(r1->cc());
- if (r1->op() == kRegexpAnyChar || r3->cc_->full()) {
+ r3->ccb_->AddCharClass(r1->ccb_);
+ if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
+ delete r3->ccb_;
+ r3->ccb_ = NULL;
r3->op_ = kRegexpAnyChar;
- delete r3->cc_;
- r3->cc_ = NULL;
}
// fall through
case kRegexpAnyChar:
@@ -579,7 +608,7 @@
re->op_ = kRegexpCapture;
// re->cap_ is already set
re->AllocSub(1);
- re->sub_[0] = r1;
+ re->sub()[0] = FinishRegexp(r1);
re->simple_ = re->ComputeSimple();
} else {
re->Decref();
@@ -598,7 +627,7 @@
return NULL;
}
stacktop_ = NULL;
- return re;
+ return FinishRegexp(re);
}
// Collapse the regexps on top of the stack, down to the
@@ -623,19 +652,23 @@
return;
// Construct op (alternation or concatenation), flattening op of op.
- Regexp* re = new Regexp(op, flags_);
- re->AllocSub(n);
+ Regexp** subs = new Regexp*[n];
next = NULL;
+ int i = n;
for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
- next = sub->down_; // in case we sub->Decref().
+ next = sub->down_;
if (sub->op_ == op) {
+ Regexp** sub_subs = sub->sub();
for (int k = sub->nsub_ - 1; k >= 0; k--)
- re->sub_[--n] = sub->sub_[k]->Incref();
+ subs[--i] = sub_subs[k]->Incref();
sub->Decref();
} else {
- re->sub_[--n] = sub;
+ subs[--i] = FinishRegexp(sub);
}
}
+
+ Regexp* re = ConcatOrAlternate(op, subs, n, flags_);
+ delete[] subs;
re->simple_ = re->ComputeSimple();
re->down_ = next;
stacktop_ = re;
@@ -665,8 +698,8 @@
}
// Incremental conversion of concatenated literals into strings.
-// If top of stack is literal, literal or literal, string or string, literal
-// or string, string, collapse into single string.
+// If top of stack is (literal, literal) or (literal, string) or (string, literal)
+// or (string, string), collapse into single string.
// Don't walk down the stack -- the parser calls this frequently
// enough that below the bottom two is known to be collapsed.
// Only called when another regexp is about to be pushed
@@ -689,8 +722,11 @@
if (re2->op_ == kRegexpLiteral) {
// convert into string
+ Rune rune = re2->rune_;
re2->op_ = kRegexpLiteralString;
- re2->AddRuneToString(re2->rune_);
+ re2->nrunes_ = 0;
+ re2->runes_ = NULL;
+ re2->AddRuneToString(rune);
}
// push re1 into re2.
@@ -983,7 +1019,7 @@
// Add a range to the character class, but exclude newline if asked.
// Also handle case folding.
-static void AddRange(CharClass* cc, Rune lo, Rune hi,
+static void AddRange(CharClassBuilder* cc, Rune lo, Rune hi,
Regexp::ParseFlags parse_flags) {
// Take out \n if the flags say so.
@@ -1037,7 +1073,7 @@
}
// Add a UGroup to the character class.
-static void AddUGroup(CharClass *cc, UGroup *g,
+static void AddUGroup(CharClassBuilder *cc, UGroup *g,
Regexp::ParseFlags parse_flags) {
for (int i = 0; i < g->nr16; i++) {
AddRange(cc, g->r16[i].lo, g->r16[i].hi, parse_flags);
@@ -1048,7 +1084,7 @@
}
// Add the negation of a UGroup to the character class.
-static void AddNegatedUGroup(CharClass* cc, UGroup *g,
+static void AddNegatedUGroup(CharClassBuilder* cc, UGroup *g,
Regexp::ParseFlags parse_flags) {
int next = 0;
for (int i = 0; i < g->nr16; i++) {
@@ -1095,7 +1131,7 @@
// Maybe parses a Unicode character group like \p{Han} or \P{Han}
// (the latter is a negated group).
ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClass *cc,
+ CharClassBuilder *cc,
RegexpStatus* status) {
// Decide whether to parse.
if (!(parse_flags & Regexp::UnicodeGroups))
@@ -1160,7 +1196,7 @@
// Sets *s to span the remainder of the string.
// Adds the ranges corresponding to the class to ranges.
static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClass *cc,
+ CharClassBuilder *cc,
RegexpStatus* status) {
// Check begins with [:
const char* p = s->data();
@@ -1257,6 +1293,7 @@
}
bool negated = false;
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
s->remove_prefix(1); // '['
if (s->size() > 0 && (*s)[0] == '^') {
s->remove_prefix(1); // '^'
@@ -1264,7 +1301,7 @@
if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
// If NL can't match implicitly, then pretend
// negated classes include a leading \n.
- re->cc_->AddRange('\n', '\n');
+ re->ccb_->AddRange('\n', '\n');
}
}
bool first = true; // ] is okay as first char in class
@@ -1290,7 +1327,7 @@
// Look for [:alnum:] etc.
if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
- switch (ParseCCName(s, flags_, re->cc_, status)) {
+ switch (ParseCCName(s, flags_, re->ccb_, status)) {
case kParseOk:
continue;
case kParseError:
@@ -1305,7 +1342,7 @@
if (s->size() > 2 &&
(*s)[0] == '\\' &&
((*s)[1] == 'p' || (*s)[1] == 'P')) {
- switch (ParseUnicodeGroup(s, flags_, re->cc_, status)) {
+ switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
case kParseOk:
continue;
case kParseError:
@@ -1319,7 +1356,7 @@
// Look for Perl character class symbols (extension).
UGroup *g = MaybeParsePerlCCEscape(s, flags_);
if (g != NULL) {
- AddUGroup(re->cc_, g, flags_);
+ AddUGroup(re->ccb_, g, flags_);
continue;
}
@@ -1334,7 +1371,7 @@
// Regexp::ClassNL is set. In an explicit range or singleton
// like we just parsed, we do not filter \n out, so set ClassNL
// in the flags.
- AddRange(re->cc_, rr.lo, rr.hi, flags_ | Regexp::ClassNL);
+ AddRange(re->ccb_, rr.lo, rr.hi, flags_ | Regexp::ClassNL);
}
if (s->size() == 0) {
status->set_code(kRegexpMissingBracket);
@@ -1345,8 +1382,8 @@
s->remove_prefix(1); // ']'
if (negated)
- re->cc_->Negate();
- re->cc_->RemoveAbove(rune_max_);
+ re->ccb_->Negate();
+ re->ccb_->RemoveAbove(rune_max_);
*out_re = re;
return true;
@@ -1752,7 +1789,8 @@
if (t[1] == 'p' || t[1] == 'P') {
Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
- switch (ParseUnicodeGroup(&t, ps.flags(), re->cc_, status)) {
+ re->ccb_ = new CharClassBuilder;
+ switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
case kParseOk:
if (!ps.PushRegexp(re))
return NULL;
@@ -1769,7 +1807,8 @@
UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
if (g != NULL) {
Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
- AddUGroup(re->cc_, g, ps.flags());
+ re->ccb_ = new CharClassBuilder;
+ AddUGroup(re->ccb_, g, ps.flags());
if (!ps.PushRegexp(re))
return NULL;
break;
diff --git a/re2/re2.cc b/re2/re2.cc
index 8f96f83..e2b4382 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -110,6 +110,7 @@
prog_ = NULL;
rprog_ = NULL;
named_groups_ = NULL;
+ num_captures_ = -1;
RegexpStatus status;
int flags = Regexp::ClassNL;
@@ -796,7 +797,11 @@
int RE2::NumberOfCapturingGroups() const {
if (suffix_regexp_ == NULL)
return -1;
- return suffix_regexp_->NumCaptures();
+ ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
+ " multiple threads end up doing the same work in parallel.");
+ if (num_captures_ == -1)
+ num_captures_ = suffix_regexp_->NumCaptures();
+ return num_captures_;
}
// Checks that the rewrite string is well-formed with respect to this
diff --git a/re2/re2.h b/re2/re2.h
index 3676875..c0d4902 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -675,6 +675,7 @@
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
+ mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
diff --git a/re2/regexp.cc b/re2/regexp.cc
index bbbb1ec..7b21fbe 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
// Regular expression representation.
-// Tested by parse_unittest.cc
+// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
@@ -15,29 +15,13 @@
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(op),
- parse_flags_(parse_flags),
- ref_(1),
simple_(false),
- num_captures_(-1),
- cap_(-1),
- max_(0),
- min_(0),
- nrunes_(0),
+ parse_flags_(static_cast<uint16>(parse_flags)),
+ ref_(1),
nsub_(0),
- rune_(0),
- cc_(NULL),
- sub_(NULL),
- runes_(NULL),
- name_(NULL),
down_(NULL) {
-
- switch (op_) {
- default:
- break;
- case kRegexpCharClass:
- cc_ = new CharClass;
- break;
- }
+ subone_ = NULL;
+ memset(the_union_, 0, sizeof the_union_);
}
// Destructor. Assumes already cleaned up children.
@@ -46,29 +30,86 @@
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
Regexp::~Regexp() {
- if (sub_)
+ if (nsub_ > 0)
LOG(DFATAL) << "Regexp not destroyed.";
- delete cc_;
- delete name_;
- delete[] runes_;
+ switch (op_) {
+ default:
+ break;
+ case kRegexpCapture:
+ delete name_;
+ break;
+ case kRegexpLiteralString:
+ delete[] runes_;
+ break;
+ case kRegexpCharClass:
+ cc_->Delete();
+ delete ccb_;
+ break;
+ }
}
// If it's possible to destroy this regexp without recurring,
// do so and return true. Else return false.
bool Regexp::QuickDestroy() {
- if (sub_ == NULL) {
+ if (nsub_ == 0) {
delete this;
return true;
}
return false;
}
+static map<Regexp*, int> ref_map;
+static Mutex ref_mutex;
+
+int Regexp::Ref() {
+ if (ref_ < kMaxRef)
+ return ref_;
+
+ MutexLock l(&ref_mutex);
+ return ref_map[this];
+}
+
+// Increments reference count, returns object as convenience.
+Regexp* Regexp::Incref() {
+ if (ref_ >= kMaxRef-1) {
+ // Store ref count in overflow map.
+ MutexLock l(&ref_mutex);
+ if (ref_ == kMaxRef) { // already overflowed
+ ref_map[this]++;
+ return this;
+ }
+ // overflowing now
+ ref_map[this] = kMaxRef;
+ ref_ = kMaxRef;
+ return this;
+ }
+
+ ref_++;
+ return this;
+}
+
+// Decrements reference count and deletes this object if count reaches 0.
+void Regexp::Decref() {
+ if (ref_ == kMaxRef) {
+ // Ref count is stored in overflow map.
+ MutexLock l(&ref_mutex);
+ int r = ref_map[this] - 1;
+ if (r < kMaxRef) {
+ ref_ = r;
+ ref_map.erase(this);
+ } else {
+ ref_map[this] = r;
+ }
+ return;
+ }
+ ref_--;
+ if (ref_ == 0)
+ Destroy();
+}
+
// Deletes this object; ref count has count reached 0.
void Regexp::Destroy() {
- if (ref_ < 0)
- LOG(DFATAL) << "Bad reference count " << ref_;
-
if (QuickDestroy())
return;
@@ -81,17 +122,22 @@
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
- if (re->sub_ != NULL) {
+ if (re->nsub_ > 0) {
+ Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
- Regexp* sub = re->sub_[i];
- if (--sub->ref_ == 0 && !sub->QuickDestroy()) {
+ Regexp* sub = subs[i];
+ if (sub->ref_ == kMaxRef)
+ sub->Decref();
+ else
+ --sub->ref_;
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
- if (re->sub_ != re->smallsub_)
- delete[] re->sub_;
- re->sub_ = NULL;
+ if (re->nsub_ > 1)
+ delete[] subs;
+ re->nsub_ = 0;
}
delete re;
}
@@ -120,7 +166,7 @@
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
re->AllocSub(1);
- re->sub_[0] = sub;
+ re->sub()[0] = sub;
return re;
}
@@ -129,7 +175,7 @@
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
- re->sub_[0] = sub;
+ re->sub()[0] = sub;
return re;
}
@@ -138,30 +184,44 @@
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
re->AllocSub(1);
- re->sub_[0] = sub;
+ re->sub()[0] = sub;
+ return re;
+}
+
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, ParseFlags flags) {
+ if (nsub > kMaxNsub) {
+ // Too many subexpressions to fit in a single Regexp.
+ // Make a two-level tree. Two levels gets us to 65535^2.
+ int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nbigsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nbigsub - 1; i++)
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags);
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags);
+ return re;
+ }
+
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nsub; i++)
+ subs[i] = sub[i];
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
- Regexp* re = new Regexp(kRegexpConcat, flags);
- re->AllocSub(nsub);
- for (int i = 0; i < nsub; i++)
- re->sub_[i] = sub[i];
- return re;
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
- Regexp* re = new Regexp(kRegexpAlternate, flags);
- re->AllocSub(nsub);
- for (int i = 0; i < nsub; i++)
- re->sub_[i] = sub[i];
- return re;
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
Regexp* re = new Regexp(kRegexpCapture, flags);
re->AllocSub(1);
- re->sub_[0] = sub;
+ re->sub()[0] = sub;
re->cap_ = cap;
return re;
}
@@ -169,7 +229,7 @@
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
Regexp* re = new Regexp(kRegexpRepeat, flags);
re->AllocSub(1);
- re->sub_[0] = sub;
+ re->sub()[0] = sub;
re->min_ = min;
re->max_ = max;
return re;
@@ -194,7 +254,6 @@
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpCharClass, flags);
- delete re->cc_;
re->cc_ = cc;
return re;
}
@@ -264,14 +323,9 @@
};
int Regexp::NumCaptures() {
- ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
- " multiple threads end up doing the same work in parallel.");
- if (num_captures_ == -1) {
- NumCapturesWalker w;
- w.Walk(this, 0);
- num_captures_ = w.ncapture();
- }
- return num_captures_;
+ NumCapturesWalker w;
+ w.Walk(this, 0);
+ return w.ncapture();
}
// Walker class to build map of named capture groups and their indices.
@@ -335,21 +389,22 @@
// Some number of anchors.
int i = 0;
- while (i < nsub_ && sub_[i]->op_ == kRegexpBeginText)
+ Regexp** sub = this->sub();
+ while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
if (i == 0)
return false;
// Then a literal or a concatenation.
if (i < nsub_) {
- Regexp* re = sub_[i];
+ Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
- if (re->parse_flags_ & Latin1) {
+ if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
@@ -370,7 +425,7 @@
break;
case kRegexpLiteral:
- if ((re->parse_flags_ & Latin1) || re->rune_ < Runeself) {
+ if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
} else {
char buf[UTFmax];
@@ -378,7 +433,7 @@
}
break;
}
- *foldcase = (sub_[i]->parse_flags_ & FoldCase);
+ *foldcase = (sub[i]->parse_flags() & FoldCase);
i++;
}
@@ -386,29 +441,29 @@
Regexp* re;
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
- sub_[j]->Incref();
- re = Concat(sub_ + i, nsub_ - i, parse_flags_);
+ sub[j]->Incref();
+ re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
- re = new Regexp(kRegexpEmptyMatch, parse_flags_);
+ re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
-// Character class is a balanced binary tree (STL set)
+// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
-CharClass::CharClass() {
+CharClassBuilder::CharClassBuilder() {
nrunes_ = 0;
upper_ = 0;
lower_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
-bool CharClass::AddRange(Rune lo, Rune hi) {
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
@@ -474,24 +529,22 @@
return true;
}
-bool CharClass::AddCharClass(CharClass *cc) {
- bool added = false;
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
for (iterator it = cc->begin(); it != cc->end(); ++it)
- added |= AddRange(it->lo, it->hi);
- return added;
+ AddRange(it->lo, it->hi);
}
-bool CharClass::Contains(Rune r) {
+bool CharClassBuilder::Contains(Rune r) {
return ranges_.find(RuneRange(r, r)) != end();
}
// Does the character class behave the same on A-Z as on a-z?
-bool CharClass::FoldsASCII() {
+bool CharClassBuilder::FoldsASCII() {
return ((upper_ ^ lower_) & AlphaMask) == 0;
}
-CharClass* CharClass::Copy() {
- CharClass* cc = new CharClass;
+CharClassBuilder* CharClassBuilder::Copy() {
+ CharClassBuilder* cc = new CharClassBuilder;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_.insert(RuneRange(it->lo, it->hi));
cc->upper_ = upper_;
@@ -500,7 +553,9 @@
return cc;
}
-void CharClass::RemoveAbove(Rune r) {
+
+
+void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
@@ -519,6 +574,7 @@
}
for (;;) {
+
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
@@ -533,7 +589,7 @@
}
}
-void CharClass::Negate() {
+void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
@@ -567,4 +623,75 @@
nrunes_ = Runemax+1 - nrunes_;
}
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+CharClass* CharClass::New(int maxranges) {
+ CharClass* cc;
+ uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
+ cc = reinterpret_cast<CharClass*>(data);
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
+ cc->nranges_ = 0;
+ cc->folds_ascii_ = false;
+ cc->nrunes_ = 0;
+ return cc;
+}
+
+void CharClass::Delete() {
+ if (this == NULL)
+ return;
+ uint8 *data = reinterpret_cast<uint8*>(this);
+ delete[] data;
+}
+
+CharClass* CharClass::Negate() {
+ CharClass* cc = CharClass::New(nranges_+1);
+ cc->folds_ascii_ = folds_ascii_;
+ cc->nrunes_ = Runemax + 1 - nrunes_;
+ int n = 0;
+ int nextlo = 0;
+ for (CharClass::iterator it = begin(); it != end(); ++it) {
+ if (it->lo == nextlo) {
+ nextlo = it->hi + 1;
+ } else {
+ cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
+ nextlo = it->hi + 1;
+ }
+ }
+ if (nextlo <= Runemax)
+ cc->ranges_[n++] = RuneRange(nextlo, Runemax);
+ cc->nranges_ = n;
+ return cc;
+}
+
+bool CharClass::Contains(Rune r) {
+ RuneRange* rr = ranges_;
+ int n = nranges_;
+ while (n > 0) {
+ int m = n/2;
+ if (rr[m].hi < r) {
+ rr += m+1;
+ n -= m+1;
+ } else if (r < rr[m].lo) {
+ n = m;
+ } else { // rr[m].lo <= r && r <= rr[m].hi
+ return true;
+ }
+ }
+ return false;
+}
+
+CharClass* CharClassBuilder::GetCharClass() {
+ CharClass* cc = CharClass::New(ranges_.size());
+ int n = 0;
+ for (iterator it = begin(); it != end(); ++it)
+ cc->ranges_[n++] = *it;
+ cc->nranges_ = n;
+ DCHECK_LE(n, ranges_.size());
+ cc->nrunes_ = nrunes_;
+ cc->folds_ascii_ = FoldsASCII();
+ return cc;
+}
+
} // namespace re2
diff --git a/re2/regexp.h b/re2/regexp.h
index 749418e..7d0c5f5 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -146,7 +146,7 @@
// Matches empty string at end of text.
kRegexpEndText,
- // Matches character class given by ranges_.
+ // Matches character class given by cc_.
kRegexpCharClass,
kMaxRegexpOp = kRegexpCharClass,
@@ -228,12 +228,44 @@
}
};
-// Character class: contains non-overlapping, non-abutting RuneRanges.
-typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+class CharClassBuilder;
class CharClass {
public:
- CharClass();
+ void Delete();
+
+ typedef RuneRange* iterator;
+ iterator begin() { return ranges_; }
+ iterator end() { return ranges_ + nranges_; }
+
+ int size() { return nrunes_; }
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+ bool FoldsASCII() { return folds_ascii_; }
+
+ bool Contains(Rune r);
+ CharClass* Negate();
+
+ private:
+ CharClass(); // not implemented
+ ~CharClass(); // not implemented
+ static CharClass* New(int maxranges);
+
+ friend class CharClassBuilder;
+
+ bool folds_ascii_;
+ int nrunes_;
+ RuneRange *ranges_;
+ int nranges_;
+ DISALLOW_EVIL_CONSTRUCTORS(CharClass);
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+ CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
iterator begin() { return ranges_.begin(); }
@@ -243,14 +275,14 @@
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
- const RuneRangeSet& ranges() { return ranges_; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
- CharClass* Copy();
- bool AddCharClass(CharClass* cc);
+ CharClassBuilder* Copy();
+ void AddCharClass(CharClassBuilder* cc);
void Negate();
void RemoveAbove(Rune r);
+ CharClass* GetCharClass();
private:
static const uint32 AlphaMask = (1<<26) - 1;
@@ -258,7 +290,7 @@
uint32 lower_; // bitmap of a-z
int nrunes_;
RuneRangeSet ranges_;
- DISALLOW_EVIL_CONSTRUCTORS(CharClass);
+ DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
};
class Regexp {
@@ -302,29 +334,37 @@
UnicodeGroups,
// Internal use only.
- WasDollar = 1<<30, // on kRegexpEndText: was $ in regexp text
+ WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
};
// Get. No set, Regexps are logically immutable once created.
- RegexpOp op() { return op_; }
- Regexp** sub() { return sub_; }
+ RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
- int min() { return min_; }
- int max() { return max_; }
- Rune rune() { return rune_; }
- CharClass* cc() { return cc_; }
- enum ParseFlags parse_flags() { return parse_flags_; }
- int cap() { return cap_; }
- const string* name() { return name_; }
bool simple() { return simple_; }
- Rune* runes() { return runes_; }
- int nrunes() { return nrunes_; }
+ enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+ int Ref(); // For testing.
+
+ Regexp** sub() {
+ if(nsub_ <= 1)
+ return &subone_;
+ else
+ return submany_;
+ }
+
+ int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+ int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+ Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+ CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+ int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+ const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
+ Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+ int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
// Increments reference count, returns object as convenience.
- Regexp* Incref() { ref_++; return this; }
+ Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
- void Decref() { if(--ref_ <= 0) Destroy(); }
+ void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
@@ -426,12 +466,17 @@
// Computes whether Regexp is already simple.
bool ComputeSimple();
+ // Constructor that generates a concatenation or alternation,
+ // enforcing the limit on the number of subexpressions for
+ // a particular Regexp.
+ static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, ParseFlags flags);
+
// Allocate space for n sub-regexps.
void AllocSub(int n) {
- if (n <= arraysize(smallsub_))
- sub_ = smallsub_;
- else
- sub_ = new Regexp*[n];
+ if (n < 0 || static_cast<uint16>(n) != n)
+ LOG(FATAL) << "Cannot AllocSub " << n;
+ if (n > 1)
+ submany_ = new Regexp*[n];
nsub_ = n;
}
@@ -439,41 +484,71 @@
void AddRuneToString(Rune r);
// Operator. See description of operators above.
- enum RegexpOp op_;
+ // uint8 instead of RegexpOp to control space usage.
+ uint8 op_;
+
+ // Is this regexp structure already simple
+ // (has it been returned by Simplify)?
+ // uint8 instead of bool to control space usage.
+ uint8 simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
- ParseFlags parse_flags_;
+ // uint16 instead of ParseFlags to control space usage.
+ uint16 parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
- int ref_;
+ // uint16 to control space usage.
+ // The standard regexp routines will never generate a
+ // ref greater than the maximum repeat count (100),
+ // but even so, Incref and Decref consult an overflow map
+ // when ref_ reaches kMaxRef.
+ uint16 ref_;
+ static const uint16 kMaxRef = 0xffff;
- // Is this regexp structure already simple
- // (has it been returned by Simplify)?
- bool simple_;
-
- // How many capturing groups? -1 if unknown.
- int num_captures_;
-
- // Arguments to operator. See description of operators above.
- int cap_;
- int max_;
- int min_;
- int nrunes_;
- int nsub_;
- Rune rune_;
-
- CharClass* cc_;
- Regexp* smallsub_[2];
- Regexp** sub_;
- Rune* runes_;
- string* name_;
+ // Subexpressions.
+ // uint16 to control space usage.
+ // Concat and Alternate handle larger numbers of subexpressions
+ // by building concatenation or alternation trees.
+ // Other routines should call Concat or Alternate instead of
+ // filling in sub() by hand.
+ uint16 nsub_;
+ static const uint16 kMaxNsub = 0xffff;
+ union {
+ Regexp** submany_; // if nsub_ > 1
+ Regexp* subone_; // if nsub_ == 1
+ };
// Extra space for parse and teardown stacks.
Regexp* down_;
+ // Arguments to operator. See description of operators above.
+ union {
+ struct { // Repeat
+ int max_;
+ int min_;
+ };
+ struct { // Capture
+ int cap_;
+ string* name_;
+ };
+ struct { // LiteralString
+ int nrunes_;
+ Rune* runes_;
+ };
+ struct { // CharClass
+ // These two could be in separate union members,
+ // but it wouldn't save any space (there are other two-word structs)
+ // and keeping them separate avoids confusion during parsing.
+ CharClass* cc_;
+ CharClassBuilder* ccb_;
+ };
+ Rune rune_; // Literal
+ void *the_union_[2]; // as big as any other element, for memset
+ };
+
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
};
diff --git a/re2/simplify.cc b/re2/simplify.cc
index 58a0973..2c9dbe3 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -40,6 +40,7 @@
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
+ Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
@@ -57,21 +58,26 @@
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
+ subs = sub();
for (int i = 0; i < nsub_; i++)
- if (!sub_[i]->simple_)
+ if (!subs[i]->simple_)
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
+ if (ccb_ != NULL)
+ return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
- return sub_[0]->simple_;
+ subs = sub();
+ return subs[0]->simple_;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
- if (!sub_[0]->simple_)
+ subs = sub();
+ if (!subs[0]->simple_)
return false;
- switch (sub_[0]->op_) {
+ switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
@@ -168,7 +174,7 @@
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
- switch (re->op_) {
+ switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
@@ -190,8 +196,9 @@
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
bool changed = false;
+ Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
- Regexp* sub = re->sub_[i];
+ Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
@@ -206,24 +213,25 @@
re->simple_ = true;
return re->Incref();
}
- Regexp* nre = new Regexp(re->op_, re->parse_flags_);
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
+ Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
- nre->sub_[i] = child_args[i];
+ nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
- if (newsub == re->sub_[0]) {
+ if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
- Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags_);
+ Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
- nre->sub_[0] = newsub;
+ nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->simple_ = true;
return nre;
@@ -235,11 +243,11 @@
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
- if (newsub->op_ == kRegexpEmptyMatch)
+ if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
- if (newsub == re->sub_[0]) {
+ if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
@@ -250,9 +258,9 @@
re->parse_flags() == newsub->parse_flags())
return newsub;
- Regexp* nre = new Regexp(re->op_, re->parse_flags_);
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
- nre->sub_[0] = newsub;
+ nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
@@ -261,11 +269,11 @@
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
- if (newsub->op_ == kRegexpEmptyMatch)
+ if (newsub->op() == kRegexpEmptyMatch)
return newsub;
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
- re->parse_flags_);
+ re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
@@ -278,7 +286,7 @@
}
}
- LOG(ERROR) << "Simplify case not handled: " << re->op_;
+ LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
@@ -288,8 +296,9 @@
Regexp::ParseFlags parse_flags) {
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
re->AllocSub(2);
- re->sub_[0] = re1;
- re->sub_[1] = re2;
+ Regexp** subs = re->sub();
+ subs[0] = re1;
+ subs[1] = re2;
return re;
}
@@ -315,9 +324,10 @@
Regexp* nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
VLOG(1) << "Simplify " << min;
+ Regexp** nre_subs = nre->sub();
for (int i = 0; i < min-1; i++)
- nre->sub_[i] = re->Incref();
- nre->sub_[min-1] = Regexp::Plus(re->Incref(), f);
+ nre_subs[i] = re->Incref();
+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
return nre;
}
@@ -338,8 +348,9 @@
if (min > 0) {
nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
+ Regexp** nre_subs = nre->sub();
for (int i = 0; i < min; i++)
- nre->sub_[i] = re->Incref();
+ nre_subs[i] = re->Incref();
}
// Build and attach suffix: (x(x(x)?)?)?
@@ -370,9 +381,9 @@
// Special cases
if (cc->empty())
- return new Regexp(kRegexpNoMatch, re->parse_flags_);
+ return new Regexp(kRegexpNoMatch, re->parse_flags());
if (cc->full())
- return new Regexp(kRegexpAnyChar, re->parse_flags_);
+ return new Regexp(kRegexpAnyChar, re->parse_flags());
return re->Incref();
}
diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc
index 217cef8..a3764d4 100644
--- a/re2/testing/charclass_test.cc
+++ b/re2/testing/charclass_test.cc
@@ -82,21 +82,26 @@
{ {-1} } },
};
+template<class CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
- printf("\n");
- printf("CharClass added: [%s]", desc);
- for (int k = 0; t->add[k].lo >= 0; k++)
- printf(" %d-%d", t->add[k].lo, t->add[k].hi);
- printf("\n");
- if (t->remove >= 0)
- printf("Removed > %d\n", t->remove);
- printf("\twant:");
- for (int k = 0; t->final[k].lo >= 0; k++)
- printf(" %d-%d", t->final[k].lo, t->final[k].hi);
- printf("\n");
+ if (t == NULL) {
+ printf("\t%s:", desc);
+ } else {
+ printf("\n");
+ printf("CharClass added: [%s]", desc);
+ for (int k = 0; t->add[k].lo >= 0; k++)
+ printf(" %d-%d", t->add[k].lo, t->add[k].hi);
+ printf("\n");
+ if (t->remove >= 0)
+ printf("Removed > %d\n", t->remove);
+ printf("\twant:");
+ for (int k = 0; t->final[k].lo >= 0; k++)
+ printf(" %d-%d", t->final[k].lo, t->final[k].hi);
+ printf("\n");
+ printf("\thave:");
+ }
- printf("\thave:");
- for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+ for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
printf(" %d-%d", it->lo, it->hi);
printf("\n");
}
@@ -108,8 +113,29 @@
return false;
}
+// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
+
+CharClass* Negate(CharClass *cc) {
+ return cc->Negate();
+}
+
+void Delete(CharClass* cc) {
+ cc->Delete();
+}
+
+CharClassBuilder* Negate(CharClassBuilder* cc) {
+ CharClassBuilder* ncc = cc->Copy();
+ ncc->Negate();
+ return ncc;
+}
+
+void Delete(CharClassBuilder* cc) {
+ delete cc;
+}
+
+template<class CharClass>
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
- CharClass::iterator it = cc->begin();
+ typename CharClass::iterator it = cc->begin();
int size = 0;
for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
if (it == cc->end() ||
@@ -141,46 +167,55 @@
}
}
- CharClass* ncc = cc->Copy();
- ncc->Negate();
+ CharClass* ncc = Negate(cc);
for (int j = 0; j < 101; j++) {
if (j == 100)
j = Runemax;
if (ShouldContain(t, j) == ncc->Contains(j)) {
Broke(desc, t, cc);
+ Broke("ncc", NULL, ncc);
printf("want ncc contains(%d)!=%d, got %d\n",
j, ShouldContain(t, j), ncc->Contains(j));
- delete ncc;
+ Delete(ncc);
return false;
}
if (ncc->size() != Runemax+1 - cc->size()) {
Broke(desc, t, cc);
+ Broke("ncc", NULL, ncc);
printf("ncc size should be %d is %d\n",
Runemax+1 - cc->size(), ncc->size());
- delete ncc;
+ Delete(ncc);
return false;
}
}
- delete ncc;
+ Delete(ncc);
return true;
}
-TEST(TestCharClass, Adds) {
+TEST(TestCharClassBuilder, Adds) {
int nfail = 0;
for (int i = 0; i < arraysize(tests); i++) {
- CharClass cc;
+ CharClassBuilder ccb;
CCTest* t = &tests[i];
for (int j = 0; t->add[j].lo >= 0; j++)
- cc.AddRange(t->add[j].lo, t->add[j].hi);
+ ccb.AddRange(t->add[j].lo, t->add[j].hi);
if (t->remove >= 0)
- cc.RemoveAbove(t->remove);
- if (!CorrectCC(&cc, t, "before copy"))
+ ccb.RemoveAbove(t->remove);
+ if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
nfail++;
+ CharClass* cc = ccb.GetCharClass();
+ if (!CorrectCC(cc, t, "before copy (CharClass)"))
+ nfail++;
+ cc->Delete();
- CharClass *cc1 = cc.Copy();
- if (!CorrectCC(cc1, t, "after copy"))
+ CharClassBuilder *ccb1 = ccb.Copy();
+ if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
nfail++;
- delete cc1;
+ cc = ccb.GetCharClass();
+ if (!CorrectCC(cc, t, "after copy (CharClass)"))
+ nfail++;
+ cc->Delete();
+ delete ccb1;
}
EXPECT_EQ(nfail, 0);
}
diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc
new file mode 100644
index 0000000..6685c37
--- /dev/null
+++ b/re2/testing/regexp_test.cc
@@ -0,0 +1,43 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test parse.cc, dump.cc, and tostring.cc.
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Test that overflowed ref counts work.
+TEST(Regexp, BigRef) {
+ Regexp* re;
+
+ re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+ for (int i = 0; i < 100000; i++)
+ re->Incref();
+ for (int i = 0; i < 100000; i++)
+ re->Decref();
+ CHECK_EQ(re->Ref(), 1);
+ re->Decref();
+}
+
+// Test that very large Concats work.
+// Depends on overflowed ref counts working.
+TEST(Regexp, BigConcat) {
+ Regexp* x;
+ x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
+ vector<Regexp*> v(90000, x); // ToString bails out at 100000
+ for (int i = 0; i < v.size(); i++)
+ x->Incref();
+ CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref();
+ Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags);
+ CHECK_EQ(re->ToString(), string(v.size(), 'x'));
+ re->Decref();
+ CHECK_EQ(x->Ref(), 1) << x->Ref();
+ x->Decref();
+}
+
+} // namespace re2
diff --git a/re2/tostring.cc b/re2/tostring.cc
index 9792eac..a4aabbe 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
-// Tested by parse_unittest.cc
+// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
@@ -250,8 +250,6 @@
case kRegexpCharClass: {
if (re->cc()->size() == 0) {
- // Simplify is supposed to rewrite this to NoMatch.
- LOG(DFATAL) << "Empty char class in ToString.";
t_->append("[^\\x00-\\x{10ffff}]");
break;
}
@@ -260,14 +258,13 @@
// non-character 0xFFFE.
CharClass* cc = re->cc();
if (cc->Contains(0xFFFE)) {
- cc = cc->Copy();
- cc->Negate();
+ cc = cc->Negate();
t_->append("^");
}
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
AppendCCRange(t_, i->lo, i->hi);
if (cc != re->cc())
- delete cc;
+ cc->Delete();
t_->append("]");
break;
}
diff --git a/re2/walker-inl.h b/re2/walker-inl.h
index 2d3158e..4d2045f 100644
--- a/re2/walker-inl.h
+++ b/re2/walker-inl.h
@@ -191,12 +191,13 @@
}
default: {
if (re->nsub_ > 0) {
+ Regexp** sub = re->sub();
if (s->n < re->nsub_) {
- if (use_copy && s->n > 0 && re->sub_[s->n - 1] == re->sub_[s->n]) {
+ if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
- stack_->push(WalkState<T>(re->sub_[s->n], s->pre_arg));
+ stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}