| // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Regular expression representation. |
| // Tested by parse_test.cc |
| |
| #include "re2/regexp.h" |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <algorithm> |
| #include <map> |
| #include <mutex> |
| #include <string> |
| #include <vector> |
| |
| #include "util/util.h" |
| #include "util/logging.h" |
| #include "util/mutex.h" |
| #include "util/utf.h" |
| #include "re2/stringpiece.h" |
| #include "re2/walker-inl.h" |
| |
| namespace re2 { |
| |
| // Constructor. Allocates vectors as appropriate for operator. |
| Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
| : op_(static_cast<uint8_t>(op)), |
| simple_(false), |
| parse_flags_(static_cast<uint16_t>(parse_flags)), |
| ref_(1), |
| nsub_(0), |
| down_(NULL) { |
| subone_ = NULL; |
| memset(the_union_, 0, sizeof the_union_); |
| } |
| |
| // Destructor. Assumes already cleaned up children. |
| // Private: use Decref() instead of delete to destroy Regexps. |
| // Can't call Decref on the sub-Regexps here because |
| // that could cause arbitrarily deep recursion, so |
| // required Decref() to have handled them for us. |
| Regexp::~Regexp() { |
| if (nsub_ > 0) |
| LOG(DFATAL) << "Regexp not destroyed."; |
| |
| switch (op_) { |
| default: |
| break; |
| case kRegexpCapture: |
| delete name_; |
| break; |
| case kRegexpLiteralString: |
| delete[] runes_; |
| break; |
| case kRegexpCharClass: |
| if (cc_) |
| cc_->Delete(); |
| delete ccb_; |
| break; |
| } |
| } |
| |
| // If it's possible to destroy this regexp without recurring, |
| // do so and return true. Else return false. |
| bool Regexp::QuickDestroy() { |
| if (nsub_ == 0) { |
| delete this; |
| return true; |
| } |
| return false; |
| } |
| |
| // Lazily allocated. |
| static Mutex* ref_mutex; |
| static std::map<Regexp*, int>* ref_map; |
| |
| int Regexp::Ref() { |
| if (ref_ < kMaxRef) |
| return ref_; |
| |
| MutexLock l(ref_mutex); |
| return (*ref_map)[this]; |
| } |
| |
| // Increments reference count, returns object as convenience. |
| Regexp* Regexp::Incref() { |
| if (ref_ >= kMaxRef-1) { |
| static std::once_flag ref_once; |
| std::call_once(ref_once, []() { |
| ref_mutex = new Mutex; |
| ref_map = new std::map<Regexp*, int>; |
| }); |
| |
| // Store ref count in overflow map. |
| MutexLock l(ref_mutex); |
| if (ref_ == kMaxRef) { |
| // already overflowed |
| (*ref_map)[this]++; |
| } else { |
| // overflowing now |
| (*ref_map)[this] = kMaxRef; |
| ref_ = kMaxRef; |
| } |
| return this; |
| } |
| |
| ref_++; |
| return this; |
| } |
| |
| // Decrements reference count and deletes this object if count reaches 0. |
| void Regexp::Decref() { |
| if (ref_ == kMaxRef) { |
| // Ref count is stored in overflow map. |
| MutexLock l(ref_mutex); |
| int r = (*ref_map)[this] - 1; |
| if (r < kMaxRef) { |
| ref_ = static_cast<uint16_t>(r); |
| ref_map->erase(this); |
| } else { |
| (*ref_map)[this] = r; |
| } |
| return; |
| } |
| ref_--; |
| if (ref_ == 0) |
| Destroy(); |
| } |
| |
| // Deletes this object; ref count has count reached 0. |
| void Regexp::Destroy() { |
| if (QuickDestroy()) |
| return; |
| |
| // Handle recursive Destroy with explicit stack |
| // to avoid arbitrarily deep recursion on process stack [sigh]. |
| down_ = NULL; |
| Regexp* stack = this; |
| while (stack != NULL) { |
| Regexp* re = stack; |
| stack = re->down_; |
| if (re->ref_ != 0) |
| LOG(DFATAL) << "Bad reference count " << re->ref_; |
| if (re->nsub_ > 0) { |
| Regexp** subs = re->sub(); |
| for (int i = 0; i < re->nsub_; i++) { |
| Regexp* sub = subs[i]; |
| if (sub == NULL) |
| continue; |
| if (sub->ref_ == kMaxRef) |
| sub->Decref(); |
| else |
| --sub->ref_; |
| if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
| sub->down_ = stack; |
| stack = sub; |
| } |
| } |
| if (re->nsub_ > 1) |
| delete[] subs; |
| re->nsub_ = 0; |
| } |
| delete re; |
| } |
| } |
| |
| void Regexp::AddRuneToString(Rune r) { |
| DCHECK(op_ == kRegexpLiteralString); |
| if (nrunes_ == 0) { |
| // start with 8 |
| runes_ = new Rune[8]; |
| } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
| // double on powers of two |
| Rune *old = runes_; |
| runes_ = new Rune[nrunes_ * 2]; |
| for (int i = 0; i < nrunes_; i++) |
| runes_[i] = old[i]; |
| delete[] old; |
| } |
| |
| runes_[nrunes_++] = r; |
| } |
| |
| Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
| Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
| re->match_id_ = match_id; |
| return re; |
| } |
| |
| Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { |
| // Squash **, ++ and ??. |
| if (op == sub->op() && flags == sub->parse_flags()) |
| return sub; |
| |
| // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because |
| // op is Star/Plus/Quest, we just have to check that sub->op() is too. |
| if ((sub->op() == kRegexpStar || |
| sub->op() == kRegexpPlus || |
| sub->op() == kRegexpQuest) && |
| flags == sub->parse_flags()) { |
| // If sub is Star, no need to rewrite it. |
| if (sub->op() == kRegexpStar) |
| return sub; |
| |
| // Rewrite sub to Star. |
| Regexp* re = new Regexp(kRegexpStar, flags); |
| re->AllocSub(1); |
| re->sub()[0] = sub->sub()[0]->Incref(); |
| sub->Decref(); // We didn't consume the reference after all. |
| return re; |
| } |
| |
| Regexp* re = new Regexp(op, flags); |
| re->AllocSub(1); |
| re->sub()[0] = sub; |
| return re; |
| } |
| |
| Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
| return StarPlusOrQuest(kRegexpPlus, sub, flags); |
| } |
| |
| Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
| return StarPlusOrQuest(kRegexpStar, sub, flags); |
| } |
| |
| Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
| return StarPlusOrQuest(kRegexpQuest, sub, flags); |
| } |
| |
| Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
| ParseFlags flags, bool can_factor) { |
| if (nsub == 1) |
| return sub[0]; |
| |
| if (nsub == 0) { |
| if (op == kRegexpAlternate) |
| return new Regexp(kRegexpNoMatch, flags); |
| else |
| return new Regexp(kRegexpEmptyMatch, flags); |
| } |
| |
| Regexp** subcopy = NULL; |
| if (op == kRegexpAlternate && can_factor) { |
| // Going to edit sub; make a copy so we don't step on caller. |
| subcopy = new Regexp*[nsub]; |
| memmove(subcopy, sub, nsub * sizeof sub[0]); |
| sub = subcopy; |
| nsub = FactorAlternation(sub, nsub, flags); |
| if (nsub == 1) { |
| Regexp* re = sub[0]; |
| delete[] subcopy; |
| return re; |
| } |
| } |
| |
| if (nsub > kMaxNsub) { |
| // Too many subexpressions to fit in a single Regexp. |
| // Make a two-level tree. Two levels gets us to 65535^2. |
| int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
| Regexp* re = new Regexp(op, flags); |
| re->AllocSub(nbigsub); |
| Regexp** subs = re->sub(); |
| for (int i = 0; i < nbigsub - 1; i++) |
| subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
| subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
| nsub - (nbigsub-1)*kMaxNsub, flags, |
| false); |
| delete[] subcopy; |
| return re; |
| } |
| |
| Regexp* re = new Regexp(op, flags); |
| re->AllocSub(nsub); |
| Regexp** subs = re->sub(); |
| for (int i = 0; i < nsub; i++) |
| subs[i] = sub[i]; |
| |
| delete[] subcopy; |
| return re; |
| } |
| |
| Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
| return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
| } |
| |
| Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
| return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
| } |
| |
| Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
| return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
| } |
| |
| Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
| Regexp* re = new Regexp(kRegexpCapture, flags); |
| re->AllocSub(1); |
| re->sub()[0] = sub; |
| re->cap_ = cap; |
| return re; |
| } |
| |
| Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
| Regexp* re = new Regexp(kRegexpRepeat, flags); |
| re->AllocSub(1); |
| re->sub()[0] = sub; |
| re->min_ = min; |
| re->max_ = max; |
| return re; |
| } |
| |
| Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
| Regexp* re = new Regexp(kRegexpLiteral, flags); |
| re->rune_ = rune; |
| return re; |
| } |
| |
| Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
| if (nrunes <= 0) |
| return new Regexp(kRegexpEmptyMatch, flags); |
| if (nrunes == 1) |
| return NewLiteral(runes[0], flags); |
| Regexp* re = new Regexp(kRegexpLiteralString, flags); |
| for (int i = 0; i < nrunes; i++) |
| re->AddRuneToString(runes[i]); |
| return re; |
| } |
| |
| Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
| Regexp* re = new Regexp(kRegexpCharClass, flags); |
| re->cc_ = cc; |
| return re; |
| } |
| |
| void Regexp::Swap(Regexp* that) { |
| // Regexp is not trivially copyable, so we cannot freely copy it with |
| // memmove(3), but swapping objects like so is safe for our purposes. |
| char tmp[sizeof *this]; |
| void* vthis = reinterpret_cast<void*>(this); |
| void* vthat = reinterpret_cast<void*>(that); |
| memmove(tmp, vthis, sizeof *this); |
| memmove(vthis, vthat, sizeof *this); |
| memmove(vthat, tmp, sizeof *this); |
| } |
| |
| // Tests equality of all top-level structure but not subregexps. |
| static bool TopEqual(Regexp* a, Regexp* b) { |
| if (a->op() != b->op()) |
| return false; |
| |
| switch (a->op()) { |
| case kRegexpNoMatch: |
| case kRegexpEmptyMatch: |
| case kRegexpAnyChar: |
| case kRegexpAnyByte: |
| case kRegexpBeginLine: |
| case kRegexpEndLine: |
| case kRegexpWordBoundary: |
| case kRegexpNoWordBoundary: |
| case kRegexpBeginText: |
| return true; |
| |
| case kRegexpEndText: |
| // The parse flags remember whether it's \z or (?-m:$), |
| // which matters when testing against PCRE. |
| return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
| |
| case kRegexpLiteral: |
| return a->rune() == b->rune() && |
| ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
| |
| case kRegexpLiteralString: |
| return a->nrunes() == b->nrunes() && |
| ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
| memcmp(a->runes(), b->runes(), |
| a->nrunes() * sizeof a->runes()[0]) == 0; |
| |
| case kRegexpAlternate: |
| case kRegexpConcat: |
| return a->nsub() == b->nsub(); |
| |
| case kRegexpStar: |
| case kRegexpPlus: |
| case kRegexpQuest: |
| return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
| |
| case kRegexpRepeat: |
| return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
| a->min() == b->min() && |
| a->max() == b->max(); |
| |
| case kRegexpCapture: |
| return a->cap() == b->cap() && a->name() == b->name(); |
| |
| case kRegexpHaveMatch: |
| return a->match_id() == b->match_id(); |
| |
| case kRegexpCharClass: { |
| CharClass* acc = a->cc(); |
| CharClass* bcc = b->cc(); |
| return acc->size() == bcc->size() && |
| acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
| memcmp(acc->begin(), bcc->begin(), |
| (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
| } |
| } |
| |
| LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
| return 0; |
| } |
| |
| bool Regexp::Equal(Regexp* a, Regexp* b) { |
| if (a == NULL || b == NULL) |
| return a == b; |
| |
| if (!TopEqual(a, b)) |
| return false; |
| |
| // Fast path: |
| // return without allocating vector if there are no subregexps. |
| switch (a->op()) { |
| case kRegexpAlternate: |
| case kRegexpConcat: |
| case kRegexpStar: |
| case kRegexpPlus: |
| case kRegexpQuest: |
| case kRegexpRepeat: |
| case kRegexpCapture: |
| break; |
| |
| default: |
| return true; |
| } |
| |
| // Committed to doing real work. |
| // The stack (vector) has pairs of regexps waiting to |
| // be compared. The regexps are only equal if |
| // all the pairs end up being equal. |
| std::vector<Regexp*> stk; |
| |
| for (;;) { |
| // Invariant: TopEqual(a, b) == true. |
| Regexp* a2; |
| Regexp* b2; |
| switch (a->op()) { |
| default: |
| break; |
| case kRegexpAlternate: |
| case kRegexpConcat: |
| for (int i = 0; i < a->nsub(); i++) { |
| a2 = a->sub()[i]; |
| b2 = b->sub()[i]; |
| if (!TopEqual(a2, b2)) |
| return false; |
| stk.push_back(a2); |
| stk.push_back(b2); |
| } |
| break; |
| |
| case kRegexpStar: |
| case kRegexpPlus: |
| case kRegexpQuest: |
| case kRegexpRepeat: |
| case kRegexpCapture: |
| a2 = a->sub()[0]; |
| b2 = b->sub()[0]; |
| if (!TopEqual(a2, b2)) |
| return false; |
| // Really: |
| // stk.push_back(a2); |
| // stk.push_back(b2); |
| // break; |
| // but faster to assign directly and loop. |
| a = a2; |
| b = b2; |
| continue; |
| } |
| |
| size_t n = stk.size(); |
| if (n == 0) |
| break; |
| |
| DCHECK_GE(n, 2); |
| a = stk[n-2]; |
| b = stk[n-1]; |
| stk.resize(n-2); |
| } |
| |
| return true; |
| } |
| |
// Human-readable text for each status code, indexed by code value.
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
  "no error",
  "unexpected error",
  "invalid escape sequence",
  "invalid character class",
  "invalid character class range",
  "missing ]",
  "missing )",
  "trailing \\",
  "no argument for repetition operator",
  "invalid repetition size",
  "bad repetition operator",
  "invalid perl operator",
  "invalid UTF-8",
  "invalid named capture group",
};
| |
| string RegexpStatus::CodeText(enum RegexpStatusCode code) { |
| if (code < 0 || code >= arraysize(kErrorStrings)) |
| code = kRegexpInternalError; |
| return kErrorStrings[code]; |
| } |
| |
| string RegexpStatus::Text() const { |
| if (error_arg_.empty()) |
| return CodeText(code_); |
| string s; |
| s.append(CodeText(code_)); |
| s.append(": "); |
| s.append(error_arg_.data(), error_arg_.size()); |
| return s; |
| } |
| |
| void RegexpStatus::Copy(const RegexpStatus& status) { |
| code_ = status.code_; |
| error_arg_ = status.error_arg_; |
| } |
| |
// Placeholder result type for walkers that produce no value:
// Walker<void> doesn't exist.
using Ignored = int;
| |
| // Walker subclass to count capturing parens in regexp. |
| class NumCapturesWalker : public Regexp::Walker<Ignored> { |
| public: |
| NumCapturesWalker() : ncapture_(0) {} |
| int ncapture() { return ncapture_; } |
| |
| virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| if (re->op() == kRegexpCapture) |
| ncapture_++; |
| return ignored; |
| } |
| virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| // Should never be called: we use Walk not WalkExponential. |
| LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; |
| return ignored; |
| } |
| |
| private: |
| int ncapture_; |
| |
| NumCapturesWalker(const NumCapturesWalker&) = delete; |
| NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; |
| }; |
| |
| int Regexp::NumCaptures() { |
| NumCapturesWalker w; |
| w.Walk(this, 0); |
| return w.ncapture(); |
| } |
| |
| // Walker class to build map of named capture groups and their indices. |
| class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
| public: |
| NamedCapturesWalker() : map_(NULL) {} |
| ~NamedCapturesWalker() { delete map_; } |
| |
| std::map<string, int>* TakeMap() { |
| std::map<string, int>* m = map_; |
| map_ = NULL; |
| return m; |
| } |
| |
| Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| if (re->op() == kRegexpCapture && re->name() != NULL) { |
| // Allocate map once we find a name. |
| if (map_ == NULL) |
| map_ = new std::map<string, int>; |
| |
| // Record first occurrence of each name. |
| // (The rule is that if you have the same name |
| // multiple times, only the leftmost one counts.) |
| if (map_->find(*re->name()) == map_->end()) |
| (*map_)[*re->name()] = re->cap(); |
| } |
| return ignored; |
| } |
| |
| virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| // Should never be called: we use Walk not WalkExponential. |
| LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; |
| return ignored; |
| } |
| |
| private: |
| std::map<string, int>* map_; |
| |
| NamedCapturesWalker(const NamedCapturesWalker&) = delete; |
| NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; |
| }; |
| |
| std::map<string, int>* Regexp::NamedCaptures() { |
| NamedCapturesWalker w; |
| w.Walk(this, 0); |
| return w.TakeMap(); |
| } |
| |
| // Walker class to build map from capture group indices to their names. |
| class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
| public: |
| CaptureNamesWalker() : map_(NULL) {} |
| ~CaptureNamesWalker() { delete map_; } |
| |
| std::map<int, string>* TakeMap() { |
| std::map<int, string>* m = map_; |
| map_ = NULL; |
| return m; |
| } |
| |
| Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| if (re->op() == kRegexpCapture && re->name() != NULL) { |
| // Allocate map once we find a name. |
| if (map_ == NULL) |
| map_ = new std::map<int, string>; |
| |
| (*map_)[re->cap()] = *re->name(); |
| } |
| return ignored; |
| } |
| |
| virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| // Should never be called: we use Walk not WalkExponential. |
| LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; |
| return ignored; |
| } |
| |
| private: |
| std::map<int, string>* map_; |
| |
| CaptureNamesWalker(const CaptureNamesWalker&) = delete; |
| CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; |
| }; |
| |
| std::map<int, string>* Regexp::CaptureNames() { |
| CaptureNamesWalker w; |
| w.Walk(this, 0); |
| return w.TakeMap(); |
| } |
| |
| // Determines whether regexp matches must be anchored |
| // with a fixed string prefix. If so, returns the prefix and |
| // the regexp that remains after the prefix. The prefix might |
| // be ASCII case-insensitive. |
| bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) { |
| // No need for a walker: the regexp must be of the form |
| // 1. some number of ^ anchors |
| // 2. a literal char or string |
| // 3. the rest |
| prefix->clear(); |
| *foldcase = false; |
| *suffix = NULL; |
| if (op_ != kRegexpConcat) |
| return false; |
| |
| // Some number of anchors, then a literal or concatenation. |
| int i = 0; |
| Regexp** sub = this->sub(); |
| while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) |
| i++; |
| if (i == 0 || i >= nsub_) |
| return false; |
| |
| Regexp* re = sub[i]; |
| switch (re->op_) { |
| default: |
| return false; |
| |
| case kRegexpLiteralString: |
| // Convert to string in proper encoding. |
| if (re->parse_flags() & Latin1) { |
| prefix->resize(re->nrunes_); |
| for (int j = 0; j < re->nrunes_; j++) |
| (*prefix)[j] = static_cast<char>(re->runes_[j]); |
| } else { |
| // Convert to UTF-8 in place. |
| // Assume worst-case space and then trim. |
| prefix->resize(re->nrunes_ * UTFmax); |
| char *p = &(*prefix)[0]; |
| for (int j = 0; j < re->nrunes_; j++) { |
| Rune r = re->runes_[j]; |
| if (r < Runeself) |
| *p++ = static_cast<char>(r); |
| else |
| p += runetochar(p, &r); |
| } |
| prefix->resize(p - &(*prefix)[0]); |
| } |
| break; |
| |
| case kRegexpLiteral: |
| if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { |
| prefix->append(1, static_cast<char>(re->rune_)); |
| } else { |
| char buf[UTFmax]; |
| prefix->append(buf, runetochar(buf, &re->rune_)); |
| } |
| break; |
| } |
| *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; |
| i++; |
| |
| // The rest. |
| if (i < nsub_) { |
| for (int j = i; j < nsub_; j++) |
| sub[j]->Incref(); |
| re = Concat(sub + i, nsub_ - i, parse_flags()); |
| } else { |
| re = new Regexp(kRegexpEmptyMatch, parse_flags()); |
| } |
| *suffix = re; |
| return true; |
| } |
| |
| // Character class builder is a balanced binary tree (STL set) |
| // containing non-overlapping, non-abutting RuneRanges. |
| // The less-than operator used in the tree treats two |
| // ranges as equal if they overlap at all, so that |
| // lookups for a particular Rune are possible. |
| |
| CharClassBuilder::CharClassBuilder() { |
| nrunes_ = 0; |
| upper_ = 0; |
| lower_ = 0; |
| } |
| |
| // Add lo-hi to the class; return whether class got bigger. |
| bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
| if (hi < lo) |
| return false; |
| |
| if (lo <= 'z' && hi >= 'A') { |
| // Overlaps some alpha, maybe not all. |
| // Update bitmaps telling which ASCII letters are in the set. |
| Rune lo1 = std::max<Rune>(lo, 'A'); |
| Rune hi1 = std::min<Rune>(hi, 'Z'); |
| if (lo1 <= hi1) |
| upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
| |
| lo1 = std::max<Rune>(lo, 'a'); |
| hi1 = std::min<Rune>(hi, 'z'); |
| if (lo1 <= hi1) |
| lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
| } |
| |
| { // Check whether lo, hi is already in the class. |
| iterator it = ranges_.find(RuneRange(lo, lo)); |
| if (it != end() && it->lo <= lo && hi <= it->hi) |
| return false; |
| } |
| |
| // Look for a range abutting lo on the left. |
| // If it exists, take it out and increase our range. |
| if (lo > 0) { |
| iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
| if (it != end()) { |
| lo = it->lo; |
| if (it->hi > hi) |
| hi = it->hi; |
| nrunes_ -= it->hi - it->lo + 1; |
| ranges_.erase(it); |
| } |
| } |
| |
| // Look for a range abutting hi on the right. |
| // If it exists, take it out and increase our range. |
| if (hi < Runemax) { |
| iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
| if (it != end()) { |
| hi = it->hi; |
| nrunes_ -= it->hi - it->lo + 1; |
| ranges_.erase(it); |
| } |
| } |
| |
| // Look for ranges between lo and hi. Take them out. |
| // This is only safe because the set has no overlapping ranges. |
| // We've already removed any ranges abutting lo and hi, so |
| // any that overlap [lo, hi] must be contained within it. |
| for (;;) { |
| iterator it = ranges_.find(RuneRange(lo, hi)); |
| if (it == end()) |
| break; |
| nrunes_ -= it->hi - it->lo + 1; |
| ranges_.erase(it); |
| } |
| |
| // Finally, add [lo, hi]. |
| nrunes_ += hi - lo + 1; |
| ranges_.insert(RuneRange(lo, hi)); |
| return true; |
| } |
| |
| void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
| for (iterator it = cc->begin(); it != cc->end(); ++it) |
| AddRange(it->lo, it->hi); |
| } |
| |
| bool CharClassBuilder::Contains(Rune r) { |
| return ranges_.find(RuneRange(r, r)) != end(); |
| } |
| |
| // Does the character class behave the same on A-Z as on a-z? |
| bool CharClassBuilder::FoldsASCII() { |
| return ((upper_ ^ lower_) & AlphaMask) == 0; |
| } |
| |
| CharClassBuilder* CharClassBuilder::Copy() { |
| CharClassBuilder* cc = new CharClassBuilder; |
| for (iterator it = begin(); it != end(); ++it) |
| cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
| cc->upper_ = upper_; |
| cc->lower_ = lower_; |
| cc->nrunes_ = nrunes_; |
| return cc; |
| } |
| |
| |
| |
| void CharClassBuilder::RemoveAbove(Rune r) { |
| if (r >= Runemax) |
| return; |
| |
| if (r < 'z') { |
| if (r < 'a') |
| lower_ = 0; |
| else |
| lower_ &= AlphaMask >> ('z' - r); |
| } |
| |
| if (r < 'Z') { |
| if (r < 'A') |
| upper_ = 0; |
| else |
| upper_ &= AlphaMask >> ('Z' - r); |
| } |
| |
| for (;;) { |
| |
| iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
| if (it == end()) |
| break; |
| RuneRange rr = *it; |
| ranges_.erase(it); |
| nrunes_ -= rr.hi - rr.lo + 1; |
| if (rr.lo <= r) { |
| rr.hi = r; |
| ranges_.insert(rr); |
| nrunes_ += rr.hi - rr.lo + 1; |
| } |
| } |
| } |
| |
| void CharClassBuilder::Negate() { |
| // Build up negation and then copy in. |
| // Could edit ranges in place, but C++ won't let me. |
| std::vector<RuneRange> v; |
| v.reserve(ranges_.size() + 1); |
| |
| // In negation, first range begins at 0, unless |
| // the current class begins at 0. |
| iterator it = begin(); |
| if (it == end()) { |
| v.push_back(RuneRange(0, Runemax)); |
| } else { |
| int nextlo = 0; |
| if (it->lo == 0) { |
| nextlo = it->hi + 1; |
| ++it; |
| } |
| for (; it != end(); ++it) { |
| v.push_back(RuneRange(nextlo, it->lo - 1)); |
| nextlo = it->hi + 1; |
| } |
| if (nextlo <= Runemax) |
| v.push_back(RuneRange(nextlo, Runemax)); |
| } |
| |
| ranges_.clear(); |
| for (size_t i = 0; i < v.size(); i++) |
| ranges_.insert(v[i]); |
| |
| upper_ = AlphaMask & ~upper_; |
| lower_ = AlphaMask & ~lower_; |
| nrunes_ = Runemax+1 - nrunes_; |
| } |
| |
| // Character class is a sorted list of ranges. |
| // The ranges are allocated in the same block as the header, |
| // necessitating a special allocator and Delete method. |
| |
| CharClass* CharClass::New(int maxranges) { |
| CharClass* cc; |
| uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
| cc = reinterpret_cast<CharClass*>(data); |
| cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
| cc->nranges_ = 0; |
| cc->folds_ascii_ = false; |
| cc->nrunes_ = 0; |
| return cc; |
| } |
| |
| void CharClass::Delete() { |
| uint8_t* data = reinterpret_cast<uint8_t*>(this); |
| delete[] data; |
| } |
| |
| CharClass* CharClass::Negate() { |
| CharClass* cc = CharClass::New(nranges_+1); |
| cc->folds_ascii_ = folds_ascii_; |
| cc->nrunes_ = Runemax + 1 - nrunes_; |
| int n = 0; |
| int nextlo = 0; |
| for (CharClass::iterator it = begin(); it != end(); ++it) { |
| if (it->lo == nextlo) { |
| nextlo = it->hi + 1; |
| } else { |
| cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
| nextlo = it->hi + 1; |
| } |
| } |
| if (nextlo <= Runemax) |
| cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
| cc->nranges_ = n; |
| return cc; |
| } |
| |
| bool CharClass::Contains(Rune r) { |
| RuneRange* rr = ranges_; |
| int n = nranges_; |
| while (n > 0) { |
| int m = n/2; |
| if (rr[m].hi < r) { |
| rr += m+1; |
| n -= m+1; |
| } else if (r < rr[m].lo) { |
| n = m; |
| } else { // rr[m].lo <= r && r <= rr[m].hi |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| CharClass* CharClassBuilder::GetCharClass() { |
| CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); |
| int n = 0; |
| for (iterator it = begin(); it != end(); ++it) |
| cc->ranges_[n++] = *it; |
| cc->nranges_ = n; |
| DCHECK_LE(n, static_cast<int>(ranges_.size())); |
| cc->nrunes_ = nrunes_; |
| cc->folds_ascii_ = FoldsASCII(); |
| return cc; |
| } |
| |
| } // namespace re2 |