Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Regular expression representation. |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 6 | // Tested by parse_test.cc |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 7 | |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 8 | #include "re2/regexp.h" |
Paul Wankadia | 0029946 | 2016-08-02 20:26:18 +1000 | [diff] [blame] | 9 | |
| 10 | #include <stddef.h> |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 11 | #include <stdint.h> |
Paul Wankadia | 0029946 | 2016-08-02 20:26:18 +1000 | [diff] [blame] | 12 | #include <string.h> |
| 13 | #include <algorithm> |
| 14 | #include <map> |
| 15 | #include <mutex> |
| 16 | #include <string> |
| 17 | #include <vector> |
| 18 | |
| 19 | #include "util/util.h" |
Paul Wankadia | cc382ec | 2016-08-17 01:00:16 +1000 | [diff] [blame] | 20 | #include "util/logging.h" |
| 21 | #include "util/mutex.h" |
| 22 | #include "util/utf.h" |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 23 | #include "re2/stringpiece.h" |
| 24 | #include "re2/walker-inl.h" |
| 25 | |
| 26 | namespace re2 { |
| 27 | |
| 28 | // Constructor. Allocates vectors as appropriate for operator. |
| 29 | Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 30 | : op_(static_cast<uint8_t>(op)), |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 31 | simple_(false), |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 32 | parse_flags_(static_cast<uint16_t>(parse_flags)), |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 33 | ref_(1), |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 34 | nsub_(0), |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 35 | down_(NULL) { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 36 | subone_ = NULL; |
| 37 | memset(the_union_, 0, sizeof the_union_); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 38 | } |
| 39 | |
| 40 | // Destructor. Assumes already cleaned up children. |
| 41 | // Private: use Decref() instead of delete to destroy Regexps. |
| 42 | // Can't call Decref on the sub-Regexps here because |
| 43 | // that could cause arbitrarily deep recursion, so |
| 44 | // required Decref() to have handled them for us. |
| 45 | Regexp::~Regexp() { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 46 | if (nsub_ > 0) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 47 | LOG(DFATAL) << "Regexp not destroyed."; |
| 48 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 49 | switch (op_) { |
| 50 | default: |
| 51 | break; |
| 52 | case kRegexpCapture: |
| 53 | delete name_; |
| 54 | break; |
| 55 | case kRegexpLiteralString: |
| 56 | delete[] runes_; |
| 57 | break; |
| 58 | case kRegexpCharClass: |
Russ Cox | 499ef7e | 2014-12-18 12:24:33 -0500 | [diff] [blame] | 59 | if (cc_) |
| 60 | cc_->Delete(); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 61 | delete ccb_; |
| 62 | break; |
| 63 | } |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 64 | } |
| 65 | |
| 66 | // If it's possible to destroy this regexp without recurring, |
| 67 | // do so and return true. Else return false. |
| 68 | bool Regexp::QuickDestroy() { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 69 | if (nsub_ == 0) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 70 | delete this; |
| 71 | return true; |
| 72 | } |
| 73 | return false; |
| 74 | } |
| 75 | |
Paul Wankadia | 88d5583 | 2016-03-02 01:32:05 +1100 | [diff] [blame] | 76 | // Lazily allocated. |
Paul Wankadia | 55b0708 | 2016-03-03 14:25:27 +1100 | [diff] [blame] | 77 | static Mutex* ref_mutex; |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 78 | static std::map<Regexp*, int>* ref_map; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 79 | |
| 80 | int Regexp::Ref() { |
| 81 | if (ref_ < kMaxRef) |
| 82 | return ref_; |
Russ Cox | 9760347 | 2010-07-15 18:26:01 -0700 | [diff] [blame] | 83 | |
Paul Wankadia | 88d5583 | 2016-03-02 01:32:05 +1100 | [diff] [blame] | 84 | MutexLock l(ref_mutex); |
| 85 | return (*ref_map)[this]; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 86 | } |
| 87 | |
| 88 | // Increments reference count, returns object as convenience. |
| 89 | Regexp* Regexp::Incref() { |
| 90 | if (ref_ >= kMaxRef-1) { |
Paul Wankadia | 88d5583 | 2016-03-02 01:32:05 +1100 | [diff] [blame] | 91 | static std::once_flag ref_once; |
| 92 | std::call_once(ref_once, []() { |
| 93 | ref_mutex = new Mutex; |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 94 | ref_map = new std::map<Regexp*, int>; |
Paul Wankadia | 88d5583 | 2016-03-02 01:32:05 +1100 | [diff] [blame] | 95 | }); |
| 96 | |
| 97 | // Store ref count in overflow map. |
| 98 | MutexLock l(ref_mutex); |
Russ Cox | 1deddeb | 2012-08-21 07:35:28 -0700 | [diff] [blame] | 99 | if (ref_ == kMaxRef) { |
| 100 | // already overflowed |
| 101 | (*ref_map)[this]++; |
| 102 | } else { |
| 103 | // overflowing now |
| 104 | (*ref_map)[this] = kMaxRef; |
| 105 | ref_ = kMaxRef; |
| 106 | } |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 107 | return this; |
| 108 | } |
| 109 | |
| 110 | ref_++; |
| 111 | return this; |
| 112 | } |
| 113 | |
| 114 | // Decrements reference count and deletes this object if count reaches 0. |
| 115 | void Regexp::Decref() { |
| 116 | if (ref_ == kMaxRef) { |
| 117 | // Ref count is stored in overflow map. |
Paul Wankadia | 88d5583 | 2016-03-02 01:32:05 +1100 | [diff] [blame] | 118 | MutexLock l(ref_mutex); |
Russ Cox | 1deddeb | 2012-08-21 07:35:28 -0700 | [diff] [blame] | 119 | int r = (*ref_map)[this] - 1; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 120 | if (r < kMaxRef) { |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 121 | ref_ = static_cast<uint16_t>(r); |
Russ Cox | 1deddeb | 2012-08-21 07:35:28 -0700 | [diff] [blame] | 122 | ref_map->erase(this); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 123 | } else { |
Russ Cox | 1deddeb | 2012-08-21 07:35:28 -0700 | [diff] [blame] | 124 | (*ref_map)[this] = r; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 125 | } |
| 126 | return; |
| 127 | } |
| 128 | ref_--; |
| 129 | if (ref_ == 0) |
| 130 | Destroy(); |
| 131 | } |
| 132 | |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 133 | // Deletes this object; ref count has count reached 0. |
| 134 | void Regexp::Destroy() { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 135 | if (QuickDestroy()) |
| 136 | return; |
| 137 | |
| 138 | // Handle recursive Destroy with explicit stack |
| 139 | // to avoid arbitrarily deep recursion on process stack [sigh]. |
| 140 | down_ = NULL; |
| 141 | Regexp* stack = this; |
| 142 | while (stack != NULL) { |
| 143 | Regexp* re = stack; |
| 144 | stack = re->down_; |
| 145 | if (re->ref_ != 0) |
| 146 | LOG(DFATAL) << "Bad reference count " << re->ref_; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 147 | if (re->nsub_ > 0) { |
| 148 | Regexp** subs = re->sub(); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 149 | for (int i = 0; i < re->nsub_; i++) { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 150 | Regexp* sub = subs[i]; |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 151 | if (sub == NULL) |
| 152 | continue; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 153 | if (sub->ref_ == kMaxRef) |
| 154 | sub->Decref(); |
| 155 | else |
| 156 | --sub->ref_; |
| 157 | if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 158 | sub->down_ = stack; |
| 159 | stack = sub; |
| 160 | } |
| 161 | } |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 162 | if (re->nsub_ > 1) |
| 163 | delete[] subs; |
| 164 | re->nsub_ = 0; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 165 | } |
| 166 | delete re; |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | void Regexp::AddRuneToString(Rune r) { |
| 171 | DCHECK(op_ == kRegexpLiteralString); |
| 172 | if (nrunes_ == 0) { |
| 173 | // start with 8 |
| 174 | runes_ = new Rune[8]; |
| 175 | } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
| 176 | // double on powers of two |
| 177 | Rune *old = runes_; |
| 178 | runes_ = new Rune[nrunes_ * 2]; |
| 179 | for (int i = 0; i < nrunes_; i++) |
| 180 | runes_[i] = old[i]; |
| 181 | delete[] old; |
| 182 | } |
| 183 | |
| 184 | runes_[nrunes_++] = r; |
| 185 | } |
| 186 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 187 | Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
| 188 | Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
| 189 | re->match_id_ = match_id; |
| 190 | return re; |
| 191 | } |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 192 | |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 193 | Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { |
| 194 | // Squash **, ++ and ??. |
| 195 | if (op == sub->op() && flags == sub->parse_flags()) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 196 | return sub; |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 197 | |
| 198 | // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because |
Paul Wankadia | e2deff3 | 2017-02-28 11:09:34 +1100 | [diff] [blame] | 199 | // op is Star/Plus/Quest, we just have to check that sub->op() is too. |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 200 | if ((sub->op() == kRegexpStar || |
| 201 | sub->op() == kRegexpPlus || |
| 202 | sub->op() == kRegexpQuest) && |
| 203 | flags == sub->parse_flags()) { |
Paul Wankadia | e2deff3 | 2017-02-28 11:09:34 +1100 | [diff] [blame] | 204 | // If sub is Star, no need to rewrite it. |
| 205 | if (sub->op() == kRegexpStar) |
| 206 | return sub; |
| 207 | |
| 208 | // Rewrite sub to Star. |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 209 | Regexp* re = new Regexp(kRegexpStar, flags); |
| 210 | re->AllocSub(1); |
| 211 | re->sub()[0] = sub->sub()[0]->Incref(); |
| 212 | sub->Decref(); // We didn't consume the reference after all. |
| 213 | return re; |
| 214 | } |
| 215 | |
| 216 | Regexp* re = new Regexp(op, flags); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 217 | re->AllocSub(1); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 218 | re->sub()[0] = sub; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 219 | return re; |
| 220 | } |
| 221 | |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 222 | Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
| 223 | return StarPlusOrQuest(kRegexpPlus, sub, flags); |
| 224 | } |
| 225 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 226 | Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 227 | return StarPlusOrQuest(kRegexpStar, sub, flags); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 228 | } |
| 229 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 230 | Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
Paul Wankadia | 14d0f1c | 2017-02-28 08:48:21 +1100 | [diff] [blame] | 231 | return StarPlusOrQuest(kRegexpQuest, sub, flags); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 232 | } |
| 233 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 234 | Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
| 235 | ParseFlags flags, bool can_factor) { |
Russ Cox | 9760347 | 2010-07-15 18:26:01 -0700 | [diff] [blame] | 236 | if (nsub == 1) |
| 237 | return sub[0]; |
| 238 | |
Russ Cox | 499ef7e | 2014-12-18 12:24:33 -0500 | [diff] [blame] | 239 | if (nsub == 0) { |
| 240 | if (op == kRegexpAlternate) |
| 241 | return new Regexp(kRegexpNoMatch, flags); |
| 242 | else |
| 243 | return new Regexp(kRegexpEmptyMatch, flags); |
| 244 | } |
| 245 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 246 | Regexp** subcopy = NULL; |
| 247 | if (op == kRegexpAlternate && can_factor) { |
| 248 | // Going to edit sub; make a copy so we don't step on caller. |
| 249 | subcopy = new Regexp*[nsub]; |
| 250 | memmove(subcopy, sub, nsub * sizeof sub[0]); |
| 251 | sub = subcopy; |
| 252 | nsub = FactorAlternation(sub, nsub, flags); |
| 253 | if (nsub == 1) { |
| 254 | Regexp* re = sub[0]; |
| 255 | delete[] subcopy; |
| 256 | return re; |
| 257 | } |
| 258 | } |
| 259 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 260 | if (nsub > kMaxNsub) { |
| 261 | // Too many subexpressions to fit in a single Regexp. |
| 262 | // Make a two-level tree. Two levels gets us to 65535^2. |
| 263 | int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
| 264 | Regexp* re = new Regexp(op, flags); |
| 265 | re->AllocSub(nbigsub); |
| 266 | Regexp** subs = re->sub(); |
| 267 | for (int i = 0; i < nbigsub - 1; i++) |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 268 | subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
| 269 | subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
| 270 | nsub - (nbigsub-1)*kMaxNsub, flags, |
| 271 | false); |
| 272 | delete[] subcopy; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 273 | return re; |
| 274 | } |
| 275 | |
| 276 | Regexp* re = new Regexp(op, flags); |
| 277 | re->AllocSub(nsub); |
| 278 | Regexp** subs = re->sub(); |
| 279 | for (int i = 0; i < nsub; i++) |
| 280 | subs[i] = sub[i]; |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 281 | |
| 282 | delete[] subcopy; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 283 | return re; |
| 284 | } |
| 285 | |
| 286 | Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 287 | return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 288 | } |
| 289 | |
| 290 | Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 291 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
| 292 | } |
| 293 | |
| 294 | Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
| 295 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 296 | } |
| 297 | |
| 298 | Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
| 299 | Regexp* re = new Regexp(kRegexpCapture, flags); |
| 300 | re->AllocSub(1); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 301 | re->sub()[0] = sub; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 302 | re->cap_ = cap; |
| 303 | return re; |
| 304 | } |
| 305 | |
| 306 | Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
| 307 | Regexp* re = new Regexp(kRegexpRepeat, flags); |
| 308 | re->AllocSub(1); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 309 | re->sub()[0] = sub; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 310 | re->min_ = min; |
| 311 | re->max_ = max; |
| 312 | return re; |
| 313 | } |
| 314 | |
| 315 | Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
| 316 | Regexp* re = new Regexp(kRegexpLiteral, flags); |
| 317 | re->rune_ = rune; |
| 318 | return re; |
| 319 | } |
| 320 | |
| 321 | Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
| 322 | if (nrunes <= 0) |
| 323 | return new Regexp(kRegexpEmptyMatch, flags); |
| 324 | if (nrunes == 1) |
| 325 | return NewLiteral(runes[0], flags); |
| 326 | Regexp* re = new Regexp(kRegexpLiteralString, flags); |
| 327 | for (int i = 0; i < nrunes; i++) |
| 328 | re->AddRuneToString(runes[i]); |
| 329 | return re; |
| 330 | } |
| 331 | |
| 332 | Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
| 333 | Regexp* re = new Regexp(kRegexpCharClass, flags); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 334 | re->cc_ = cc; |
| 335 | return re; |
| 336 | } |
| 337 | |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 338 | void Regexp::Swap(Regexp* that) { |
Paul Wankadia | 6cf8ccd | 2018-05-16 02:09:21 -0700 | [diff] [blame] | 339 | // Regexp is not trivially copyable, so we cannot freely copy it with |
| 340 | // memmove(3), but swapping objects like so is safe for our purposes. |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 341 | char tmp[sizeof *this]; |
Paul Wankadia | 6cf8ccd | 2018-05-16 02:09:21 -0700 | [diff] [blame] | 342 | void* vthis = reinterpret_cast<void*>(this); |
| 343 | void* vthat = reinterpret_cast<void*>(that); |
| 344 | memmove(tmp, vthis, sizeof *this); |
| 345 | memmove(vthis, vthat, sizeof *this); |
| 346 | memmove(vthat, tmp, sizeof *this); |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 347 | } |
| 348 | |
| 349 | // Tests equality of all top-level structure but not subregexps. |
| 350 | static bool TopEqual(Regexp* a, Regexp* b) { |
| 351 | if (a->op() != b->op()) |
| 352 | return false; |
| 353 | |
| 354 | switch (a->op()) { |
| 355 | case kRegexpNoMatch: |
| 356 | case kRegexpEmptyMatch: |
| 357 | case kRegexpAnyChar: |
| 358 | case kRegexpAnyByte: |
| 359 | case kRegexpBeginLine: |
| 360 | case kRegexpEndLine: |
| 361 | case kRegexpWordBoundary: |
| 362 | case kRegexpNoWordBoundary: |
| 363 | case kRegexpBeginText: |
| 364 | return true; |
| 365 | |
| 366 | case kRegexpEndText: |
| 367 | // The parse flags remember whether it's \z or (?-m:$), |
| 368 | // which matters when testing against PCRE. |
| 369 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
| 370 | |
| 371 | case kRegexpLiteral: |
| 372 | return a->rune() == b->rune() && |
| 373 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
| 374 | |
| 375 | case kRegexpLiteralString: |
| 376 | return a->nrunes() == b->nrunes() && |
| 377 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
| 378 | memcmp(a->runes(), b->runes(), |
| 379 | a->nrunes() * sizeof a->runes()[0]) == 0; |
| 380 | |
| 381 | case kRegexpAlternate: |
| 382 | case kRegexpConcat: |
| 383 | return a->nsub() == b->nsub(); |
| 384 | |
| 385 | case kRegexpStar: |
| 386 | case kRegexpPlus: |
| 387 | case kRegexpQuest: |
| 388 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
| 389 | |
| 390 | case kRegexpRepeat: |
| 391 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
| 392 | a->min() == b->min() && |
| 393 | a->max() == b->max(); |
| 394 | |
| 395 | case kRegexpCapture: |
| 396 | return a->cap() == b->cap() && a->name() == b->name(); |
| 397 | |
| 398 | case kRegexpHaveMatch: |
| 399 | return a->match_id() == b->match_id(); |
| 400 | |
| 401 | case kRegexpCharClass: { |
| 402 | CharClass* acc = a->cc(); |
| 403 | CharClass* bcc = b->cc(); |
| 404 | return acc->size() == bcc->size() && |
| 405 | acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
| 406 | memcmp(acc->begin(), bcc->begin(), |
| 407 | (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
| 408 | } |
| 409 | } |
| 410 | |
| 411 | LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
| 412 | return 0; |
| 413 | } |
| 414 | |
| 415 | bool Regexp::Equal(Regexp* a, Regexp* b) { |
| 416 | if (a == NULL || b == NULL) |
| 417 | return a == b; |
| 418 | |
| 419 | if (!TopEqual(a, b)) |
| 420 | return false; |
| 421 | |
| 422 | // Fast path: |
| 423 | // return without allocating vector if there are no subregexps. |
| 424 | switch (a->op()) { |
| 425 | case kRegexpAlternate: |
| 426 | case kRegexpConcat: |
| 427 | case kRegexpStar: |
| 428 | case kRegexpPlus: |
| 429 | case kRegexpQuest: |
| 430 | case kRegexpRepeat: |
| 431 | case kRegexpCapture: |
| 432 | break; |
| 433 | |
| 434 | default: |
| 435 | return true; |
| 436 | } |
| 437 | |
| 438 | // Committed to doing real work. |
| 439 | // The stack (vector) has pairs of regexps waiting to |
| 440 | // be compared. The regexps are only equal if |
| 441 | // all the pairs end up being equal. |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 442 | std::vector<Regexp*> stk; |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 443 | |
| 444 | for (;;) { |
| 445 | // Invariant: TopEqual(a, b) == true. |
| 446 | Regexp* a2; |
| 447 | Regexp* b2; |
| 448 | switch (a->op()) { |
| 449 | default: |
| 450 | break; |
| 451 | case kRegexpAlternate: |
| 452 | case kRegexpConcat: |
| 453 | for (int i = 0; i < a->nsub(); i++) { |
| 454 | a2 = a->sub()[i]; |
| 455 | b2 = b->sub()[i]; |
| 456 | if (!TopEqual(a2, b2)) |
| 457 | return false; |
| 458 | stk.push_back(a2); |
| 459 | stk.push_back(b2); |
| 460 | } |
| 461 | break; |
| 462 | |
| 463 | case kRegexpStar: |
| 464 | case kRegexpPlus: |
| 465 | case kRegexpQuest: |
| 466 | case kRegexpRepeat: |
| 467 | case kRegexpCapture: |
| 468 | a2 = a->sub()[0]; |
| 469 | b2 = b->sub()[0]; |
| 470 | if (!TopEqual(a2, b2)) |
| 471 | return false; |
| 472 | // Really: |
| 473 | // stk.push_back(a2); |
| 474 | // stk.push_back(b2); |
| 475 | // break; |
| 476 | // but faster to assign directly and loop. |
| 477 | a = a2; |
| 478 | b = b2; |
| 479 | continue; |
| 480 | } |
| 481 | |
Paul Wankadia | 89567f5 | 2015-12-01 13:53:24 +1100 | [diff] [blame] | 482 | size_t n = stk.size(); |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 483 | if (n == 0) |
| 484 | break; |
| 485 | |
Paul Wankadia | 89567f5 | 2015-12-01 13:53:24 +1100 | [diff] [blame] | 486 | DCHECK_GE(n, 2); |
Russ Cox | 4a9f4ca | 2010-07-15 20:38:05 -0700 | [diff] [blame] | 487 | a = stk[n-2]; |
| 488 | b = stk[n-1]; |
| 489 | stk.resize(n-2); |
| 490 | } |
| 491 | |
| 492 | return true; |
| 493 | } |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 494 | |
// Human-readable text for each status code, indexed by the code's value.
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char* kErrorStrings[] = {
  "no error",
  "unexpected error",
  "invalid escape sequence",
  "invalid character class",
  "invalid character class range",
  "missing ]",
  "missing )",
  "trailing \\",
  "no argument for repetition operator",
  "invalid repetition size",
  "bad repetition operator",
  "invalid perl operator",
  "invalid UTF-8",
  "invalid named capture group",
};
| 512 | |
Russ Cox | 1deddeb | 2012-08-21 07:35:28 -0700 | [diff] [blame] | 513 | string RegexpStatus::CodeText(enum RegexpStatusCode code) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 514 | if (code < 0 || code >= arraysize(kErrorStrings)) |
| 515 | code = kRegexpInternalError; |
| 516 | return kErrorStrings[code]; |
| 517 | } |
| 518 | |
| 519 | string RegexpStatus::Text() const { |
| 520 | if (error_arg_.empty()) |
| 521 | return CodeText(code_); |
| 522 | string s; |
| 523 | s.append(CodeText(code_)); |
| 524 | s.append(": "); |
| 525 | s.append(error_arg_.data(), error_arg_.size()); |
| 526 | return s; |
| 527 | } |
| 528 | |
| 529 | void RegexpStatus::Copy(const RegexpStatus& status) { |
| 530 | code_ = status.code_; |
| 531 | error_arg_ = status.error_arg_; |
| 532 | } |
| 533 | |
// Placeholder walker argument/result type: Walker<void> doesn't exist.
using Ignored = int;
| 535 | |
| 536 | // Walker subclass to count capturing parens in regexp. |
| 537 | class NumCapturesWalker : public Regexp::Walker<Ignored> { |
| 538 | public: |
| 539 | NumCapturesWalker() : ncapture_(0) {} |
| 540 | int ncapture() { return ncapture_; } |
| 541 | |
| 542 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 543 | if (re->op() == kRegexpCapture) |
| 544 | ncapture_++; |
| 545 | return ignored; |
| 546 | } |
| 547 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 548 | // Should never be called: we use Walk not WalkExponential. |
| 549 | LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; |
| 550 | return ignored; |
| 551 | } |
| 552 | |
| 553 | private: |
| 554 | int ncapture_; |
Paul Wankadia | f408be0 | 2016-08-16 23:42:11 +1000 | [diff] [blame] | 555 | |
| 556 | NumCapturesWalker(const NumCapturesWalker&) = delete; |
| 557 | NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 558 | }; |
| 559 | |
| 560 | int Regexp::NumCaptures() { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 561 | NumCapturesWalker w; |
| 562 | w.Walk(this, 0); |
| 563 | return w.ncapture(); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 564 | } |
| 565 | |
| 566 | // Walker class to build map of named capture groups and their indices. |
| 567 | class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
| 568 | public: |
| 569 | NamedCapturesWalker() : map_(NULL) {} |
| 570 | ~NamedCapturesWalker() { delete map_; } |
| 571 | |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 572 | std::map<string, int>* TakeMap() { |
| 573 | std::map<string, int>* m = map_; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 574 | map_ = NULL; |
| 575 | return m; |
| 576 | } |
| 577 | |
| 578 | Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 579 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
| 580 | // Allocate map once we find a name. |
| 581 | if (map_ == NULL) |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 582 | map_ = new std::map<string, int>; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 583 | |
| 584 | // Record first occurrence of each name. |
| 585 | // (The rule is that if you have the same name |
| 586 | // multiple times, only the leftmost one counts.) |
| 587 | if (map_->find(*re->name()) == map_->end()) |
| 588 | (*map_)[*re->name()] = re->cap(); |
| 589 | } |
| 590 | return ignored; |
| 591 | } |
| 592 | |
| 593 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 594 | // Should never be called: we use Walk not WalkExponential. |
| 595 | LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; |
| 596 | return ignored; |
| 597 | } |
| 598 | |
| 599 | private: |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 600 | std::map<string, int>* map_; |
Paul Wankadia | f408be0 | 2016-08-16 23:42:11 +1000 | [diff] [blame] | 601 | |
| 602 | NamedCapturesWalker(const NamedCapturesWalker&) = delete; |
| 603 | NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 604 | }; |
| 605 | |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 606 | std::map<string, int>* Regexp::NamedCaptures() { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 607 | NamedCapturesWalker w; |
| 608 | w.Walk(this, 0); |
| 609 | return w.TakeMap(); |
| 610 | } |
| 611 | |
Russ Cox | e308191 | 2011-02-01 11:09:33 -0500 | [diff] [blame] | 612 | // Walker class to build map from capture group indices to their names. |
| 613 | class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
| 614 | public: |
| 615 | CaptureNamesWalker() : map_(NULL) {} |
| 616 | ~CaptureNamesWalker() { delete map_; } |
| 617 | |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 618 | std::map<int, string>* TakeMap() { |
| 619 | std::map<int, string>* m = map_; |
Russ Cox | e308191 | 2011-02-01 11:09:33 -0500 | [diff] [blame] | 620 | map_ = NULL; |
| 621 | return m; |
| 622 | } |
| 623 | |
| 624 | Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
| 625 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
| 626 | // Allocate map once we find a name. |
| 627 | if (map_ == NULL) |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 628 | map_ = new std::map<int, string>; |
Russ Cox | e308191 | 2011-02-01 11:09:33 -0500 | [diff] [blame] | 629 | |
| 630 | (*map_)[re->cap()] = *re->name(); |
| 631 | } |
| 632 | return ignored; |
| 633 | } |
| 634 | |
| 635 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
| 636 | // Should never be called: we use Walk not WalkExponential. |
| 637 | LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; |
| 638 | return ignored; |
| 639 | } |
| 640 | |
| 641 | private: |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 642 | std::map<int, string>* map_; |
Paul Wankadia | f408be0 | 2016-08-16 23:42:11 +1000 | [diff] [blame] | 643 | |
| 644 | CaptureNamesWalker(const CaptureNamesWalker&) = delete; |
| 645 | CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; |
Russ Cox | e308191 | 2011-02-01 11:09:33 -0500 | [diff] [blame] | 646 | }; |
| 647 | |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 648 | std::map<int, string>* Regexp::CaptureNames() { |
Russ Cox | e308191 | 2011-02-01 11:09:33 -0500 | [diff] [blame] | 649 | CaptureNamesWalker w; |
| 650 | w.Walk(this, 0); |
| 651 | return w.TakeMap(); |
| 652 | } |
| 653 | |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 654 | // Determines whether regexp matches must be anchored |
| 655 | // with a fixed string prefix. If so, returns the prefix and |
| 656 | // the regexp that remains after the prefix. The prefix might |
| 657 | // be ASCII case-insensitive. |
Paul Wankadia | 7b88dbe | 2017-05-17 21:32:46 +1000 | [diff] [blame] | 658 | bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 659 | // No need for a walker: the regexp must be of the form |
| 660 | // 1. some number of ^ anchors |
| 661 | // 2. a literal char or string |
| 662 | // 3. the rest |
| 663 | prefix->clear(); |
| 664 | *foldcase = false; |
| 665 | *suffix = NULL; |
| 666 | if (op_ != kRegexpConcat) |
| 667 | return false; |
| 668 | |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 669 | // Some number of anchors, then a literal or concatenation. |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 670 | int i = 0; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 671 | Regexp** sub = this->sub(); |
| 672 | while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 673 | i++; |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 674 | if (i == 0 || i >= nsub_) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 675 | return false; |
| 676 | |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 677 | Regexp* re = sub[i]; |
| 678 | switch (re->op_) { |
| 679 | default: |
| 680 | return false; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 681 | |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 682 | case kRegexpLiteralString: |
| 683 | // Convert to string in proper encoding. |
| 684 | if (re->parse_flags() & Latin1) { |
| 685 | prefix->resize(re->nrunes_); |
| 686 | for (int j = 0; j < re->nrunes_; j++) |
Peter Kasting | 0fde84a | 2014-12-09 16:33:28 -0800 | [diff] [blame] | 687 | (*prefix)[j] = static_cast<char>(re->runes_[j]); |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 688 | } else { |
| 689 | // Convert to UTF-8 in place. |
| 690 | // Assume worst-case space and then trim. |
| 691 | prefix->resize(re->nrunes_ * UTFmax); |
| 692 | char *p = &(*prefix)[0]; |
| 693 | for (int j = 0; j < re->nrunes_; j++) { |
| 694 | Rune r = re->runes_[j]; |
| 695 | if (r < Runeself) |
Peter Kasting | 0fde84a | 2014-12-09 16:33:28 -0800 | [diff] [blame] | 696 | *p++ = static_cast<char>(r); |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 697 | else |
| 698 | p += runetochar(p, &r); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 699 | } |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 700 | prefix->resize(p - &(*prefix)[0]); |
| 701 | } |
| 702 | break; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 703 | |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 704 | case kRegexpLiteral: |
| 705 | if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { |
Peter Kasting | 0fde84a | 2014-12-09 16:33:28 -0800 | [diff] [blame] | 706 | prefix->append(1, static_cast<char>(re->rune_)); |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 707 | } else { |
| 708 | char buf[UTFmax]; |
| 709 | prefix->append(buf, runetochar(buf, &re->rune_)); |
| 710 | } |
| 711 | break; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 712 | } |
Paul Wankadia | 196ee29 | 2015-12-06 17:06:22 +1100 | [diff] [blame] | 713 | *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; |
Russ Cox | e414fec | 2011-03-02 15:59:46 -0500 | [diff] [blame] | 714 | i++; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 715 | |
| 716 | // The rest. |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 717 | if (i < nsub_) { |
| 718 | for (int j = i; j < nsub_; j++) |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 719 | sub[j]->Incref(); |
| 720 | re = Concat(sub + i, nsub_ - i, parse_flags()); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 721 | } else { |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 722 | re = new Regexp(kRegexpEmptyMatch, parse_flags()); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 723 | } |
| 724 | *suffix = re; |
| 725 | return true; |
| 726 | } |
| 727 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 728 | // Character class builder is a balanced binary tree (STL set) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 729 | // containing non-overlapping, non-abutting RuneRanges. |
| 730 | // The less-than operator used in the tree treats two |
| 731 | // ranges as equal if they overlap at all, so that |
| 732 | // lookups for a particular Rune are possible. |
| 733 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 734 | CharClassBuilder::CharClassBuilder() { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 735 | nrunes_ = 0; |
| 736 | upper_ = 0; |
| 737 | lower_ = 0; |
| 738 | } |
| 739 | |
| 740 | // Add lo-hi to the class; return whether class got bigger. |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 741 | bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 742 | if (hi < lo) |
| 743 | return false; |
| 744 | |
| 745 | if (lo <= 'z' && hi >= 'A') { |
| 746 | // Overlaps some alpha, maybe not all. |
| 747 | // Update bitmaps telling which ASCII letters are in the set. |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 748 | Rune lo1 = std::max<Rune>(lo, 'A'); |
| 749 | Rune hi1 = std::min<Rune>(hi, 'Z'); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 750 | if (lo1 <= hi1) |
| 751 | upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
| 752 | |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 753 | lo1 = std::max<Rune>(lo, 'a'); |
| 754 | hi1 = std::min<Rune>(hi, 'z'); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 755 | if (lo1 <= hi1) |
| 756 | lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
| 757 | } |
| 758 | |
| 759 | { // Check whether lo, hi is already in the class. |
| 760 | iterator it = ranges_.find(RuneRange(lo, lo)); |
| 761 | if (it != end() && it->lo <= lo && hi <= it->hi) |
| 762 | return false; |
| 763 | } |
| 764 | |
| 765 | // Look for a range abutting lo on the left. |
| 766 | // If it exists, take it out and increase our range. |
| 767 | if (lo > 0) { |
| 768 | iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
| 769 | if (it != end()) { |
| 770 | lo = it->lo; |
| 771 | if (it->hi > hi) |
| 772 | hi = it->hi; |
| 773 | nrunes_ -= it->hi - it->lo + 1; |
| 774 | ranges_.erase(it); |
| 775 | } |
| 776 | } |
| 777 | |
| 778 | // Look for a range abutting hi on the right. |
| 779 | // If it exists, take it out and increase our range. |
| 780 | if (hi < Runemax) { |
| 781 | iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
| 782 | if (it != end()) { |
| 783 | hi = it->hi; |
| 784 | nrunes_ -= it->hi - it->lo + 1; |
| 785 | ranges_.erase(it); |
| 786 | } |
| 787 | } |
| 788 | |
| 789 | // Look for ranges between lo and hi. Take them out. |
| 790 | // This is only safe because the set has no overlapping ranges. |
| 791 | // We've already removed any ranges abutting lo and hi, so |
| 792 | // any that overlap [lo, hi] must be contained within it. |
| 793 | for (;;) { |
| 794 | iterator it = ranges_.find(RuneRange(lo, hi)); |
| 795 | if (it == end()) |
| 796 | break; |
| 797 | nrunes_ -= it->hi - it->lo + 1; |
| 798 | ranges_.erase(it); |
| 799 | } |
| 800 | |
| 801 | // Finally, add [lo, hi]. |
| 802 | nrunes_ += hi - lo + 1; |
| 803 | ranges_.insert(RuneRange(lo, hi)); |
| 804 | return true; |
| 805 | } |
| 806 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 807 | void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 808 | for (iterator it = cc->begin(); it != cc->end(); ++it) |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 809 | AddRange(it->lo, it->hi); |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 810 | } |
| 811 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 812 | bool CharClassBuilder::Contains(Rune r) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 813 | return ranges_.find(RuneRange(r, r)) != end(); |
| 814 | } |
| 815 | |
| 816 | // Does the character class behave the same on A-Z as on a-z? |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 817 | bool CharClassBuilder::FoldsASCII() { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 818 | return ((upper_ ^ lower_) & AlphaMask) == 0; |
| 819 | } |
| 820 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 821 | CharClassBuilder* CharClassBuilder::Copy() { |
| 822 | CharClassBuilder* cc = new CharClassBuilder; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 823 | for (iterator it = begin(); it != end(); ++it) |
| 824 | cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
| 825 | cc->upper_ = upper_; |
| 826 | cc->lower_ = lower_; |
| 827 | cc->nrunes_ = nrunes_; |
| 828 | return cc; |
| 829 | } |
| 830 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 831 | |
| 832 | |
| 833 | void CharClassBuilder::RemoveAbove(Rune r) { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 834 | if (r >= Runemax) |
| 835 | return; |
| 836 | |
| 837 | if (r < 'z') { |
| 838 | if (r < 'a') |
| 839 | lower_ = 0; |
| 840 | else |
| 841 | lower_ &= AlphaMask >> ('z' - r); |
| 842 | } |
| 843 | |
| 844 | if (r < 'Z') { |
| 845 | if (r < 'A') |
| 846 | upper_ = 0; |
| 847 | else |
| 848 | upper_ &= AlphaMask >> ('Z' - r); |
| 849 | } |
| 850 | |
| 851 | for (;;) { |
Russ Cox | 9760347 | 2010-07-15 18:26:01 -0700 | [diff] [blame] | 852 | |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 853 | iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
| 854 | if (it == end()) |
| 855 | break; |
| 856 | RuneRange rr = *it; |
| 857 | ranges_.erase(it); |
| 858 | nrunes_ -= rr.hi - rr.lo + 1; |
| 859 | if (rr.lo <= r) { |
| 860 | rr.hi = r; |
| 861 | ranges_.insert(rr); |
| 862 | nrunes_ += rr.hi - rr.lo + 1; |
| 863 | } |
| 864 | } |
| 865 | } |
| 866 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 867 | void CharClassBuilder::Negate() { |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 868 | // Build up negation and then copy in. |
| 869 | // Could edit ranges in place, but C++ won't let me. |
Paul Wankadia | ee55a8f | 2016-08-02 21:49:57 +1000 | [diff] [blame] | 870 | std::vector<RuneRange> v; |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 871 | v.reserve(ranges_.size() + 1); |
| 872 | |
| 873 | // In negation, first range begins at 0, unless |
| 874 | // the current class begins at 0. |
| 875 | iterator it = begin(); |
| 876 | if (it == end()) { |
| 877 | v.push_back(RuneRange(0, Runemax)); |
| 878 | } else { |
| 879 | int nextlo = 0; |
| 880 | if (it->lo == 0) { |
| 881 | nextlo = it->hi + 1; |
| 882 | ++it; |
| 883 | } |
| 884 | for (; it != end(); ++it) { |
| 885 | v.push_back(RuneRange(nextlo, it->lo - 1)); |
| 886 | nextlo = it->hi + 1; |
| 887 | } |
| 888 | if (nextlo <= Runemax) |
| 889 | v.push_back(RuneRange(nextlo, Runemax)); |
| 890 | } |
| 891 | |
| 892 | ranges_.clear(); |
Russ Cox | 11a0e6e | 2014-11-20 14:42:50 -0500 | [diff] [blame] | 893 | for (size_t i = 0; i < v.size(); i++) |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 894 | ranges_.insert(v[i]); |
| 895 | |
| 896 | upper_ = AlphaMask & ~upper_; |
| 897 | lower_ = AlphaMask & ~lower_; |
| 898 | nrunes_ = Runemax+1 - nrunes_; |
| 899 | } |
| 900 | |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 901 | // Character class is a sorted list of ranges. |
| 902 | // The ranges are allocated in the same block as the header, |
| 903 | // necessitating a special allocator and Delete method. |
| 904 | |
| 905 | CharClass* CharClass::New(int maxranges) { |
| 906 | CharClass* cc; |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 907 | uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 908 | cc = reinterpret_cast<CharClass*>(data); |
| 909 | cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
| 910 | cc->nranges_ = 0; |
| 911 | cc->folds_ascii_ = false; |
| 912 | cc->nrunes_ = 0; |
| 913 | return cc; |
| 914 | } |
| 915 | |
| 916 | void CharClass::Delete() { |
Paul Wankadia | d877825 | 2016-08-07 21:44:17 +1000 | [diff] [blame] | 917 | uint8_t* data = reinterpret_cast<uint8_t*>(this); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 918 | delete[] data; |
| 919 | } |
| 920 | |
| 921 | CharClass* CharClass::Negate() { |
| 922 | CharClass* cc = CharClass::New(nranges_+1); |
| 923 | cc->folds_ascii_ = folds_ascii_; |
| 924 | cc->nrunes_ = Runemax + 1 - nrunes_; |
| 925 | int n = 0; |
| 926 | int nextlo = 0; |
| 927 | for (CharClass::iterator it = begin(); it != end(); ++it) { |
| 928 | if (it->lo == nextlo) { |
| 929 | nextlo = it->hi + 1; |
| 930 | } else { |
| 931 | cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
| 932 | nextlo = it->hi + 1; |
| 933 | } |
| 934 | } |
| 935 | if (nextlo <= Runemax) |
| 936 | cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
| 937 | cc->nranges_ = n; |
| 938 | return cc; |
| 939 | } |
| 940 | |
| 941 | bool CharClass::Contains(Rune r) { |
| 942 | RuneRange* rr = ranges_; |
| 943 | int n = nranges_; |
| 944 | while (n > 0) { |
| 945 | int m = n/2; |
| 946 | if (rr[m].hi < r) { |
| 947 | rr += m+1; |
| 948 | n -= m+1; |
| 949 | } else if (r < rr[m].lo) { |
| 950 | n = m; |
| 951 | } else { // rr[m].lo <= r && r <= rr[m].hi |
| 952 | return true; |
| 953 | } |
| 954 | } |
| 955 | return false; |
| 956 | } |
| 957 | |
| 958 | CharClass* CharClassBuilder::GetCharClass() { |
Peter Kasting | dbb7000 | 2015-07-21 15:07:34 -0700 | [diff] [blame] | 959 | CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); |
| 960 | int n = 0; |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 961 | for (iterator it = begin(); it != end(); ++it) |
| 962 | cc->ranges_[n++] = *it; |
| 963 | cc->nranges_ = n; |
Peter Kasting | dbb7000 | 2015-07-21 15:07:34 -0700 | [diff] [blame] | 964 | DCHECK_LE(n, static_cast<int>(ranges_.size())); |
Russ Cox | 34d900b | 2010-05-10 16:03:39 -0700 | [diff] [blame] | 965 | cc->nrunes_ = nrunes_; |
| 966 | cc->folds_ascii_ = FoldsASCII(); |
| 967 | return cc; |
| 968 | } |
| 969 | |
Russ Cox | 0a38cba | 2010-03-02 17:17:51 -0800 | [diff] [blame] | 970 | } // namespace re2 |