// Copyright 2006 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Regular expression representation.
// Tested by parse_test.cc
Russ Cox0a38cba2010-03-02 17:17:51 -08007
Russ Cox0a38cba2010-03-02 17:17:51 -08008#include "re2/regexp.h"
Paul Wankadia00299462016-08-02 20:26:18 +10009
10#include <stddef.h>
Paul Wankadiad8778252016-08-07 21:44:17 +100011#include <stdint.h>
Paul Wankadia00299462016-08-02 20:26:18 +100012#include <string.h>
13#include <algorithm>
14#include <map>
15#include <mutex>
16#include <string>
17#include <vector>
18
19#include "util/util.h"
Paul Wankadiacc382ec2016-08-17 01:00:16 +100020#include "util/logging.h"
21#include "util/mutex.h"
22#include "util/utf.h"
Russ Cox0a38cba2010-03-02 17:17:51 -080023#include "re2/stringpiece.h"
24#include "re2/walker-inl.h"
25
26namespace re2 {
27
28// Constructor. Allocates vectors as appropriate for operator.
29Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
Paul Wankadiad8778252016-08-07 21:44:17 +100030 : op_(static_cast<uint8_t>(op)),
Russ Cox0a38cba2010-03-02 17:17:51 -080031 simple_(false),
Paul Wankadiad8778252016-08-07 21:44:17 +100032 parse_flags_(static_cast<uint16_t>(parse_flags)),
Russ Cox34d900b2010-05-10 16:03:39 -070033 ref_(1),
Russ Cox0a38cba2010-03-02 17:17:51 -080034 nsub_(0),
Russ Cox0a38cba2010-03-02 17:17:51 -080035 down_(NULL) {
Russ Cox34d900b2010-05-10 16:03:39 -070036 subone_ = NULL;
37 memset(the_union_, 0, sizeof the_union_);
Russ Cox0a38cba2010-03-02 17:17:51 -080038}
39
40// Destructor. Assumes already cleaned up children.
41// Private: use Decref() instead of delete to destroy Regexps.
42// Can't call Decref on the sub-Regexps here because
43// that could cause arbitrarily deep recursion, so
44// required Decref() to have handled them for us.
45Regexp::~Regexp() {
Russ Cox34d900b2010-05-10 16:03:39 -070046 if (nsub_ > 0)
Russ Cox0a38cba2010-03-02 17:17:51 -080047 LOG(DFATAL) << "Regexp not destroyed.";
48
Russ Cox34d900b2010-05-10 16:03:39 -070049 switch (op_) {
50 default:
51 break;
52 case kRegexpCapture:
53 delete name_;
54 break;
55 case kRegexpLiteralString:
56 delete[] runes_;
57 break;
58 case kRegexpCharClass:
Russ Cox499ef7e2014-12-18 12:24:33 -050059 if (cc_)
60 cc_->Delete();
Russ Cox34d900b2010-05-10 16:03:39 -070061 delete ccb_;
62 break;
63 }
Russ Cox0a38cba2010-03-02 17:17:51 -080064}
65
66// If it's possible to destroy this regexp without recurring,
67// do so and return true. Else return false.
68bool Regexp::QuickDestroy() {
Russ Cox34d900b2010-05-10 16:03:39 -070069 if (nsub_ == 0) {
Russ Cox0a38cba2010-03-02 17:17:51 -080070 delete this;
71 return true;
72 }
73 return false;
74}
75
Paul Wankadia88d55832016-03-02 01:32:05 +110076// Lazily allocated.
Paul Wankadia55b07082016-03-03 14:25:27 +110077static Mutex* ref_mutex;
Paul Wankadiaee55a8f2016-08-02 21:49:57 +100078static std::map<Regexp*, int>* ref_map;
Russ Cox34d900b2010-05-10 16:03:39 -070079
80int Regexp::Ref() {
81 if (ref_ < kMaxRef)
82 return ref_;
Russ Cox97603472010-07-15 18:26:01 -070083
Paul Wankadia88d55832016-03-02 01:32:05 +110084 MutexLock l(ref_mutex);
85 return (*ref_map)[this];
Russ Cox34d900b2010-05-10 16:03:39 -070086}
87
88// Increments reference count, returns object as convenience.
89Regexp* Regexp::Incref() {
90 if (ref_ >= kMaxRef-1) {
Paul Wankadia88d55832016-03-02 01:32:05 +110091 static std::once_flag ref_once;
92 std::call_once(ref_once, []() {
93 ref_mutex = new Mutex;
Paul Wankadiaee55a8f2016-08-02 21:49:57 +100094 ref_map = new std::map<Regexp*, int>;
Paul Wankadia88d55832016-03-02 01:32:05 +110095 });
96
97 // Store ref count in overflow map.
98 MutexLock l(ref_mutex);
Russ Cox1deddeb2012-08-21 07:35:28 -070099 if (ref_ == kMaxRef) {
100 // already overflowed
101 (*ref_map)[this]++;
102 } else {
103 // overflowing now
104 (*ref_map)[this] = kMaxRef;
105 ref_ = kMaxRef;
106 }
Russ Cox34d900b2010-05-10 16:03:39 -0700107 return this;
108 }
109
110 ref_++;
111 return this;
112}
113
114// Decrements reference count and deletes this object if count reaches 0.
115void Regexp::Decref() {
116 if (ref_ == kMaxRef) {
117 // Ref count is stored in overflow map.
Paul Wankadia88d55832016-03-02 01:32:05 +1100118 MutexLock l(ref_mutex);
Russ Cox1deddeb2012-08-21 07:35:28 -0700119 int r = (*ref_map)[this] - 1;
Russ Cox34d900b2010-05-10 16:03:39 -0700120 if (r < kMaxRef) {
Paul Wankadiad8778252016-08-07 21:44:17 +1000121 ref_ = static_cast<uint16_t>(r);
Russ Cox1deddeb2012-08-21 07:35:28 -0700122 ref_map->erase(this);
Russ Cox34d900b2010-05-10 16:03:39 -0700123 } else {
Russ Cox1deddeb2012-08-21 07:35:28 -0700124 (*ref_map)[this] = r;
Russ Cox34d900b2010-05-10 16:03:39 -0700125 }
126 return;
127 }
128 ref_--;
129 if (ref_ == 0)
130 Destroy();
131}
132
Russ Cox0a38cba2010-03-02 17:17:51 -0800133// Deletes this object; ref count has count reached 0.
134void Regexp::Destroy() {
Russ Cox0a38cba2010-03-02 17:17:51 -0800135 if (QuickDestroy())
136 return;
137
138 // Handle recursive Destroy with explicit stack
139 // to avoid arbitrarily deep recursion on process stack [sigh].
140 down_ = NULL;
141 Regexp* stack = this;
142 while (stack != NULL) {
143 Regexp* re = stack;
144 stack = re->down_;
145 if (re->ref_ != 0)
146 LOG(DFATAL) << "Bad reference count " << re->ref_;
Russ Cox34d900b2010-05-10 16:03:39 -0700147 if (re->nsub_ > 0) {
148 Regexp** subs = re->sub();
Russ Cox0a38cba2010-03-02 17:17:51 -0800149 for (int i = 0; i < re->nsub_; i++) {
Russ Cox34d900b2010-05-10 16:03:39 -0700150 Regexp* sub = subs[i];
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700151 if (sub == NULL)
152 continue;
Russ Cox34d900b2010-05-10 16:03:39 -0700153 if (sub->ref_ == kMaxRef)
154 sub->Decref();
155 else
156 --sub->ref_;
157 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800158 sub->down_ = stack;
159 stack = sub;
160 }
161 }
Russ Cox34d900b2010-05-10 16:03:39 -0700162 if (re->nsub_ > 1)
163 delete[] subs;
164 re->nsub_ = 0;
Russ Cox0a38cba2010-03-02 17:17:51 -0800165 }
166 delete re;
167 }
168}
169
170void Regexp::AddRuneToString(Rune r) {
171 DCHECK(op_ == kRegexpLiteralString);
172 if (nrunes_ == 0) {
173 // start with 8
174 runes_ = new Rune[8];
175 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
176 // double on powers of two
177 Rune *old = runes_;
178 runes_ = new Rune[nrunes_ * 2];
179 for (int i = 0; i < nrunes_; i++)
180 runes_[i] = old[i];
181 delete[] old;
182 }
183
184 runes_[nrunes_++] = r;
185}
186
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700187Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
189 re->match_id_ = match_id;
190 return re;
191}
Russ Cox0a38cba2010-03-02 17:17:51 -0800192
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100193Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194 // Squash **, ++ and ??.
195 if (op == sub->op() && flags == sub->parse_flags())
Russ Cox0a38cba2010-03-02 17:17:51 -0800196 return sub;
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100197
198 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
Paul Wankadiae2deff32017-02-28 11:09:34 +1100199 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100200 if ((sub->op() == kRegexpStar ||
201 sub->op() == kRegexpPlus ||
202 sub->op() == kRegexpQuest) &&
203 flags == sub->parse_flags()) {
Paul Wankadiae2deff32017-02-28 11:09:34 +1100204 // If sub is Star, no need to rewrite it.
205 if (sub->op() == kRegexpStar)
206 return sub;
207
208 // Rewrite sub to Star.
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100209 Regexp* re = new Regexp(kRegexpStar, flags);
210 re->AllocSub(1);
211 re->sub()[0] = sub->sub()[0]->Incref();
212 sub->Decref(); // We didn't consume the reference after all.
213 return re;
214 }
215
216 Regexp* re = new Regexp(op, flags);
Russ Cox0a38cba2010-03-02 17:17:51 -0800217 re->AllocSub(1);
Russ Cox34d900b2010-05-10 16:03:39 -0700218 re->sub()[0] = sub;
Russ Cox0a38cba2010-03-02 17:17:51 -0800219 return re;
220}
221
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100222Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223 return StarPlusOrQuest(kRegexpPlus, sub, flags);
224}
225
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700226Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100227 return StarPlusOrQuest(kRegexpStar, sub, flags);
Russ Cox0a38cba2010-03-02 17:17:51 -0800228}
229
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700230Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
Paul Wankadia14d0f1c2017-02-28 08:48:21 +1100231 return StarPlusOrQuest(kRegexpQuest, sub, flags);
Russ Cox34d900b2010-05-10 16:03:39 -0700232}
233
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700234Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235 ParseFlags flags, bool can_factor) {
Russ Cox97603472010-07-15 18:26:01 -0700236 if (nsub == 1)
237 return sub[0];
238
Russ Cox499ef7e2014-12-18 12:24:33 -0500239 if (nsub == 0) {
240 if (op == kRegexpAlternate)
241 return new Regexp(kRegexpNoMatch, flags);
242 else
243 return new Regexp(kRegexpEmptyMatch, flags);
244 }
245
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700246 Regexp** subcopy = NULL;
247 if (op == kRegexpAlternate && can_factor) {
248 // Going to edit sub; make a copy so we don't step on caller.
249 subcopy = new Regexp*[nsub];
250 memmove(subcopy, sub, nsub * sizeof sub[0]);
251 sub = subcopy;
252 nsub = FactorAlternation(sub, nsub, flags);
253 if (nsub == 1) {
254 Regexp* re = sub[0];
255 delete[] subcopy;
256 return re;
257 }
258 }
259
Russ Cox34d900b2010-05-10 16:03:39 -0700260 if (nsub > kMaxNsub) {
261 // Too many subexpressions to fit in a single Regexp.
262 // Make a two-level tree. Two levels gets us to 65535^2.
263 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264 Regexp* re = new Regexp(op, flags);
265 re->AllocSub(nbigsub);
266 Regexp** subs = re->sub();
267 for (int i = 0; i < nbigsub - 1; i++)
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700268 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270 nsub - (nbigsub-1)*kMaxNsub, flags,
271 false);
272 delete[] subcopy;
Russ Cox34d900b2010-05-10 16:03:39 -0700273 return re;
274 }
275
276 Regexp* re = new Regexp(op, flags);
277 re->AllocSub(nsub);
278 Regexp** subs = re->sub();
279 for (int i = 0; i < nsub; i++)
280 subs[i] = sub[i];
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700281
282 delete[] subcopy;
Russ Cox0a38cba2010-03-02 17:17:51 -0800283 return re;
284}
285
286Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700287 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
Russ Cox0a38cba2010-03-02 17:17:51 -0800288}
289
290Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700291 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
292}
293
294Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
295 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
Russ Cox0a38cba2010-03-02 17:17:51 -0800296}
297
298Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
299 Regexp* re = new Regexp(kRegexpCapture, flags);
300 re->AllocSub(1);
Russ Cox34d900b2010-05-10 16:03:39 -0700301 re->sub()[0] = sub;
Russ Cox0a38cba2010-03-02 17:17:51 -0800302 re->cap_ = cap;
303 return re;
304}
305
306Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
307 Regexp* re = new Regexp(kRegexpRepeat, flags);
308 re->AllocSub(1);
Russ Cox34d900b2010-05-10 16:03:39 -0700309 re->sub()[0] = sub;
Russ Cox0a38cba2010-03-02 17:17:51 -0800310 re->min_ = min;
311 re->max_ = max;
312 return re;
313}
314
315Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
316 Regexp* re = new Regexp(kRegexpLiteral, flags);
317 re->rune_ = rune;
318 return re;
319}
320
321Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
322 if (nrunes <= 0)
323 return new Regexp(kRegexpEmptyMatch, flags);
324 if (nrunes == 1)
325 return NewLiteral(runes[0], flags);
326 Regexp* re = new Regexp(kRegexpLiteralString, flags);
327 for (int i = 0; i < nrunes; i++)
328 re->AddRuneToString(runes[i]);
329 return re;
330}
331
332Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
333 Regexp* re = new Regexp(kRegexpCharClass, flags);
Russ Cox0a38cba2010-03-02 17:17:51 -0800334 re->cc_ = cc;
335 return re;
336}
337
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700338void Regexp::Swap(Regexp* that) {
Paul Wankadia6cf8ccd2018-05-16 02:09:21 -0700339 // Regexp is not trivially copyable, so we cannot freely copy it with
340 // memmove(3), but swapping objects like so is safe for our purposes.
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700341 char tmp[sizeof *this];
Paul Wankadia6cf8ccd2018-05-16 02:09:21 -0700342 void* vthis = reinterpret_cast<void*>(this);
343 void* vthat = reinterpret_cast<void*>(that);
344 memmove(tmp, vthis, sizeof *this);
345 memmove(vthis, vthat, sizeof *this);
346 memmove(vthat, tmp, sizeof *this);
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700347}
348
349// Tests equality of all top-level structure but not subregexps.
350static bool TopEqual(Regexp* a, Regexp* b) {
351 if (a->op() != b->op())
352 return false;
353
354 switch (a->op()) {
355 case kRegexpNoMatch:
356 case kRegexpEmptyMatch:
357 case kRegexpAnyChar:
358 case kRegexpAnyByte:
359 case kRegexpBeginLine:
360 case kRegexpEndLine:
361 case kRegexpWordBoundary:
362 case kRegexpNoWordBoundary:
363 case kRegexpBeginText:
364 return true;
365
366 case kRegexpEndText:
367 // The parse flags remember whether it's \z or (?-m:$),
368 // which matters when testing against PCRE.
369 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
370
371 case kRegexpLiteral:
372 return a->rune() == b->rune() &&
373 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
374
375 case kRegexpLiteralString:
376 return a->nrunes() == b->nrunes() &&
377 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
378 memcmp(a->runes(), b->runes(),
379 a->nrunes() * sizeof a->runes()[0]) == 0;
380
381 case kRegexpAlternate:
382 case kRegexpConcat:
383 return a->nsub() == b->nsub();
384
385 case kRegexpStar:
386 case kRegexpPlus:
387 case kRegexpQuest:
388 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
389
390 case kRegexpRepeat:
391 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
392 a->min() == b->min() &&
393 a->max() == b->max();
394
395 case kRegexpCapture:
396 return a->cap() == b->cap() && a->name() == b->name();
397
398 case kRegexpHaveMatch:
399 return a->match_id() == b->match_id();
400
401 case kRegexpCharClass: {
402 CharClass* acc = a->cc();
403 CharClass* bcc = b->cc();
404 return acc->size() == bcc->size() &&
405 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
406 memcmp(acc->begin(), bcc->begin(),
407 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
408 }
409 }
410
411 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
412 return 0;
413}
414
415bool Regexp::Equal(Regexp* a, Regexp* b) {
416 if (a == NULL || b == NULL)
417 return a == b;
418
419 if (!TopEqual(a, b))
420 return false;
421
422 // Fast path:
423 // return without allocating vector if there are no subregexps.
424 switch (a->op()) {
425 case kRegexpAlternate:
426 case kRegexpConcat:
427 case kRegexpStar:
428 case kRegexpPlus:
429 case kRegexpQuest:
430 case kRegexpRepeat:
431 case kRegexpCapture:
432 break;
433
434 default:
435 return true;
436 }
437
438 // Committed to doing real work.
439 // The stack (vector) has pairs of regexps waiting to
440 // be compared. The regexps are only equal if
441 // all the pairs end up being equal.
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000442 std::vector<Regexp*> stk;
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700443
444 for (;;) {
445 // Invariant: TopEqual(a, b) == true.
446 Regexp* a2;
447 Regexp* b2;
448 switch (a->op()) {
449 default:
450 break;
451 case kRegexpAlternate:
452 case kRegexpConcat:
453 for (int i = 0; i < a->nsub(); i++) {
454 a2 = a->sub()[i];
455 b2 = b->sub()[i];
456 if (!TopEqual(a2, b2))
457 return false;
458 stk.push_back(a2);
459 stk.push_back(b2);
460 }
461 break;
462
463 case kRegexpStar:
464 case kRegexpPlus:
465 case kRegexpQuest:
466 case kRegexpRepeat:
467 case kRegexpCapture:
468 a2 = a->sub()[0];
469 b2 = b->sub()[0];
470 if (!TopEqual(a2, b2))
471 return false;
472 // Really:
473 // stk.push_back(a2);
474 // stk.push_back(b2);
475 // break;
476 // but faster to assign directly and loop.
477 a = a2;
478 b = b2;
479 continue;
480 }
481
Paul Wankadia89567f52015-12-01 13:53:24 +1100482 size_t n = stk.size();
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700483 if (n == 0)
484 break;
485
Paul Wankadia89567f52015-12-01 13:53:24 +1100486 DCHECK_GE(n, 2);
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700487 a = stk[n-2];
488 b = stk[n-1];
489 stk.resize(n-2);
490 }
491
492 return true;
493}
Russ Cox0a38cba2010-03-02 17:17:51 -0800494
// Human-readable text for each status code, indexed by the code's
// numeric value.  The order must be kept in sync with
// enum RegexpStatusCode in regexp.h.
static const char* kErrorStrings[] = {
  "no error",
  "unexpected error",
  "invalid escape sequence",
  "invalid character class",
  "invalid character class range",
  "missing ]",
  "missing )",
  "trailing \\",
  "no argument for repetition operator",
  "invalid repetition size",
  "bad repetition operator",
  "invalid perl operator",
  "invalid UTF-8",
  "invalid named capture group",
};
512
Russ Cox1deddeb2012-08-21 07:35:28 -0700513string RegexpStatus::CodeText(enum RegexpStatusCode code) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800514 if (code < 0 || code >= arraysize(kErrorStrings))
515 code = kRegexpInternalError;
516 return kErrorStrings[code];
517}
518
519string RegexpStatus::Text() const {
520 if (error_arg_.empty())
521 return CodeText(code_);
522 string s;
523 s.append(CodeText(code_));
524 s.append(": ");
525 s.append(error_arg_.data(), error_arg_.size());
526 return s;
527}
528
529void RegexpStatus::Copy(const RegexpStatus& status) {
530 code_ = status.code_;
531 error_arg_ = status.error_arg_;
532}
533
534typedef int Ignored; // Walker<void> doesn't exist
535
536// Walker subclass to count capturing parens in regexp.
537class NumCapturesWalker : public Regexp::Walker<Ignored> {
538 public:
539 NumCapturesWalker() : ncapture_(0) {}
540 int ncapture() { return ncapture_; }
541
542 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
543 if (re->op() == kRegexpCapture)
544 ncapture_++;
545 return ignored;
546 }
547 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
548 // Should never be called: we use Walk not WalkExponential.
549 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550 return ignored;
551 }
552
553 private:
554 int ncapture_;
Paul Wankadiaf408be02016-08-16 23:42:11 +1000555
556 NumCapturesWalker(const NumCapturesWalker&) = delete;
557 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
Russ Cox0a38cba2010-03-02 17:17:51 -0800558};
559
560int Regexp::NumCaptures() {
Russ Cox34d900b2010-05-10 16:03:39 -0700561 NumCapturesWalker w;
562 w.Walk(this, 0);
563 return w.ncapture();
Russ Cox0a38cba2010-03-02 17:17:51 -0800564}
565
566// Walker class to build map of named capture groups and their indices.
567class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568 public:
569 NamedCapturesWalker() : map_(NULL) {}
570 ~NamedCapturesWalker() { delete map_; }
571
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000572 std::map<string, int>* TakeMap() {
573 std::map<string, int>* m = map_;
Russ Cox0a38cba2010-03-02 17:17:51 -0800574 map_ = NULL;
575 return m;
576 }
577
578 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579 if (re->op() == kRegexpCapture && re->name() != NULL) {
580 // Allocate map once we find a name.
581 if (map_ == NULL)
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000582 map_ = new std::map<string, int>;
Russ Cox0a38cba2010-03-02 17:17:51 -0800583
584 // Record first occurrence of each name.
585 // (The rule is that if you have the same name
586 // multiple times, only the leftmost one counts.)
587 if (map_->find(*re->name()) == map_->end())
588 (*map_)[*re->name()] = re->cap();
589 }
590 return ignored;
591 }
592
593 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594 // Should never be called: we use Walk not WalkExponential.
595 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596 return ignored;
597 }
598
599 private:
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000600 std::map<string, int>* map_;
Paul Wankadiaf408be02016-08-16 23:42:11 +1000601
602 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
603 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
Russ Cox0a38cba2010-03-02 17:17:51 -0800604};
605
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000606std::map<string, int>* Regexp::NamedCaptures() {
Russ Cox0a38cba2010-03-02 17:17:51 -0800607 NamedCapturesWalker w;
608 w.Walk(this, 0);
609 return w.TakeMap();
610}
611
Russ Coxe3081912011-02-01 11:09:33 -0500612// Walker class to build map from capture group indices to their names.
613class CaptureNamesWalker : public Regexp::Walker<Ignored> {
614 public:
615 CaptureNamesWalker() : map_(NULL) {}
616 ~CaptureNamesWalker() { delete map_; }
617
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000618 std::map<int, string>* TakeMap() {
619 std::map<int, string>* m = map_;
Russ Coxe3081912011-02-01 11:09:33 -0500620 map_ = NULL;
621 return m;
622 }
623
624 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
625 if (re->op() == kRegexpCapture && re->name() != NULL) {
626 // Allocate map once we find a name.
627 if (map_ == NULL)
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000628 map_ = new std::map<int, string>;
Russ Coxe3081912011-02-01 11:09:33 -0500629
630 (*map_)[re->cap()] = *re->name();
631 }
632 return ignored;
633 }
634
635 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
636 // Should never be called: we use Walk not WalkExponential.
637 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
638 return ignored;
639 }
640
641 private:
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000642 std::map<int, string>* map_;
Paul Wankadiaf408be02016-08-16 23:42:11 +1000643
644 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
645 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
Russ Coxe3081912011-02-01 11:09:33 -0500646};
647
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000648std::map<int, string>* Regexp::CaptureNames() {
Russ Coxe3081912011-02-01 11:09:33 -0500649 CaptureNamesWalker w;
650 w.Walk(this, 0);
651 return w.TakeMap();
652}
653
Russ Cox0a38cba2010-03-02 17:17:51 -0800654// Determines whether regexp matches must be anchored
655// with a fixed string prefix. If so, returns the prefix and
656// the regexp that remains after the prefix. The prefix might
657// be ASCII case-insensitive.
Paul Wankadia7b88dbe2017-05-17 21:32:46 +1000658bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800659 // No need for a walker: the regexp must be of the form
660 // 1. some number of ^ anchors
661 // 2. a literal char or string
662 // 3. the rest
663 prefix->clear();
664 *foldcase = false;
665 *suffix = NULL;
666 if (op_ != kRegexpConcat)
667 return false;
668
Russ Coxe414fec2011-03-02 15:59:46 -0500669 // Some number of anchors, then a literal or concatenation.
Russ Cox0a38cba2010-03-02 17:17:51 -0800670 int i = 0;
Russ Cox34d900b2010-05-10 16:03:39 -0700671 Regexp** sub = this->sub();
672 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
Russ Cox0a38cba2010-03-02 17:17:51 -0800673 i++;
Russ Coxe414fec2011-03-02 15:59:46 -0500674 if (i == 0 || i >= nsub_)
Russ Cox0a38cba2010-03-02 17:17:51 -0800675 return false;
676
Russ Coxe414fec2011-03-02 15:59:46 -0500677 Regexp* re = sub[i];
678 switch (re->op_) {
679 default:
680 return false;
Russ Cox0a38cba2010-03-02 17:17:51 -0800681
Russ Coxe414fec2011-03-02 15:59:46 -0500682 case kRegexpLiteralString:
683 // Convert to string in proper encoding.
684 if (re->parse_flags() & Latin1) {
685 prefix->resize(re->nrunes_);
686 for (int j = 0; j < re->nrunes_; j++)
Peter Kasting0fde84a2014-12-09 16:33:28 -0800687 (*prefix)[j] = static_cast<char>(re->runes_[j]);
Russ Coxe414fec2011-03-02 15:59:46 -0500688 } else {
689 // Convert to UTF-8 in place.
690 // Assume worst-case space and then trim.
691 prefix->resize(re->nrunes_ * UTFmax);
692 char *p = &(*prefix)[0];
693 for (int j = 0; j < re->nrunes_; j++) {
694 Rune r = re->runes_[j];
695 if (r < Runeself)
Peter Kasting0fde84a2014-12-09 16:33:28 -0800696 *p++ = static_cast<char>(r);
Russ Coxe414fec2011-03-02 15:59:46 -0500697 else
698 p += runetochar(p, &r);
Russ Cox0a38cba2010-03-02 17:17:51 -0800699 }
Russ Coxe414fec2011-03-02 15:59:46 -0500700 prefix->resize(p - &(*prefix)[0]);
701 }
702 break;
Russ Cox0a38cba2010-03-02 17:17:51 -0800703
Russ Coxe414fec2011-03-02 15:59:46 -0500704 case kRegexpLiteral:
705 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
Peter Kasting0fde84a2014-12-09 16:33:28 -0800706 prefix->append(1, static_cast<char>(re->rune_));
Russ Coxe414fec2011-03-02 15:59:46 -0500707 } else {
708 char buf[UTFmax];
709 prefix->append(buf, runetochar(buf, &re->rune_));
710 }
711 break;
Russ Cox0a38cba2010-03-02 17:17:51 -0800712 }
Paul Wankadia196ee292015-12-06 17:06:22 +1100713 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
Russ Coxe414fec2011-03-02 15:59:46 -0500714 i++;
Russ Cox0a38cba2010-03-02 17:17:51 -0800715
716 // The rest.
Russ Cox0a38cba2010-03-02 17:17:51 -0800717 if (i < nsub_) {
718 for (int j = i; j < nsub_; j++)
Russ Cox34d900b2010-05-10 16:03:39 -0700719 sub[j]->Incref();
720 re = Concat(sub + i, nsub_ - i, parse_flags());
Russ Cox0a38cba2010-03-02 17:17:51 -0800721 } else {
Russ Cox34d900b2010-05-10 16:03:39 -0700722 re = new Regexp(kRegexpEmptyMatch, parse_flags());
Russ Cox0a38cba2010-03-02 17:17:51 -0800723 }
724 *suffix = re;
725 return true;
726}
727
Russ Cox34d900b2010-05-10 16:03:39 -0700728// Character class builder is a balanced binary tree (STL set)
Russ Cox0a38cba2010-03-02 17:17:51 -0800729// containing non-overlapping, non-abutting RuneRanges.
730// The less-than operator used in the tree treats two
731// ranges as equal if they overlap at all, so that
732// lookups for a particular Rune are possible.
733
Russ Cox34d900b2010-05-10 16:03:39 -0700734CharClassBuilder::CharClassBuilder() {
Russ Cox0a38cba2010-03-02 17:17:51 -0800735 nrunes_ = 0;
736 upper_ = 0;
737 lower_ = 0;
738}
739
740// Add lo-hi to the class; return whether class got bigger.
Russ Cox34d900b2010-05-10 16:03:39 -0700741bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800742 if (hi < lo)
743 return false;
744
745 if (lo <= 'z' && hi >= 'A') {
746 // Overlaps some alpha, maybe not all.
747 // Update bitmaps telling which ASCII letters are in the set.
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000748 Rune lo1 = std::max<Rune>(lo, 'A');
749 Rune hi1 = std::min<Rune>(hi, 'Z');
Russ Cox0a38cba2010-03-02 17:17:51 -0800750 if (lo1 <= hi1)
751 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
752
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000753 lo1 = std::max<Rune>(lo, 'a');
754 hi1 = std::min<Rune>(hi, 'z');
Russ Cox0a38cba2010-03-02 17:17:51 -0800755 if (lo1 <= hi1)
756 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
757 }
758
759 { // Check whether lo, hi is already in the class.
760 iterator it = ranges_.find(RuneRange(lo, lo));
761 if (it != end() && it->lo <= lo && hi <= it->hi)
762 return false;
763 }
764
765 // Look for a range abutting lo on the left.
766 // If it exists, take it out and increase our range.
767 if (lo > 0) {
768 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
769 if (it != end()) {
770 lo = it->lo;
771 if (it->hi > hi)
772 hi = it->hi;
773 nrunes_ -= it->hi - it->lo + 1;
774 ranges_.erase(it);
775 }
776 }
777
778 // Look for a range abutting hi on the right.
779 // If it exists, take it out and increase our range.
780 if (hi < Runemax) {
781 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
782 if (it != end()) {
783 hi = it->hi;
784 nrunes_ -= it->hi - it->lo + 1;
785 ranges_.erase(it);
786 }
787 }
788
789 // Look for ranges between lo and hi. Take them out.
790 // This is only safe because the set has no overlapping ranges.
791 // We've already removed any ranges abutting lo and hi, so
792 // any that overlap [lo, hi] must be contained within it.
793 for (;;) {
794 iterator it = ranges_.find(RuneRange(lo, hi));
795 if (it == end())
796 break;
797 nrunes_ -= it->hi - it->lo + 1;
798 ranges_.erase(it);
799 }
800
801 // Finally, add [lo, hi].
802 nrunes_ += hi - lo + 1;
803 ranges_.insert(RuneRange(lo, hi));
804 return true;
805}
806
Russ Cox34d900b2010-05-10 16:03:39 -0700807void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800808 for (iterator it = cc->begin(); it != cc->end(); ++it)
Russ Cox34d900b2010-05-10 16:03:39 -0700809 AddRange(it->lo, it->hi);
Russ Cox0a38cba2010-03-02 17:17:51 -0800810}
811
Russ Cox34d900b2010-05-10 16:03:39 -0700812bool CharClassBuilder::Contains(Rune r) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800813 return ranges_.find(RuneRange(r, r)) != end();
814}
815
816// Does the character class behave the same on A-Z as on a-z?
Russ Cox34d900b2010-05-10 16:03:39 -0700817bool CharClassBuilder::FoldsASCII() {
Russ Cox0a38cba2010-03-02 17:17:51 -0800818 return ((upper_ ^ lower_) & AlphaMask) == 0;
819}
820
Russ Cox34d900b2010-05-10 16:03:39 -0700821CharClassBuilder* CharClassBuilder::Copy() {
822 CharClassBuilder* cc = new CharClassBuilder;
Russ Cox0a38cba2010-03-02 17:17:51 -0800823 for (iterator it = begin(); it != end(); ++it)
824 cc->ranges_.insert(RuneRange(it->lo, it->hi));
825 cc->upper_ = upper_;
826 cc->lower_ = lower_;
827 cc->nrunes_ = nrunes_;
828 return cc;
829}
830
Russ Cox34d900b2010-05-10 16:03:39 -0700831
832
833void CharClassBuilder::RemoveAbove(Rune r) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800834 if (r >= Runemax)
835 return;
836
837 if (r < 'z') {
838 if (r < 'a')
839 lower_ = 0;
840 else
841 lower_ &= AlphaMask >> ('z' - r);
842 }
843
844 if (r < 'Z') {
845 if (r < 'A')
846 upper_ = 0;
847 else
848 upper_ &= AlphaMask >> ('Z' - r);
849 }
850
851 for (;;) {
Russ Cox97603472010-07-15 18:26:01 -0700852
Russ Cox0a38cba2010-03-02 17:17:51 -0800853 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
854 if (it == end())
855 break;
856 RuneRange rr = *it;
857 ranges_.erase(it);
858 nrunes_ -= rr.hi - rr.lo + 1;
859 if (rr.lo <= r) {
860 rr.hi = r;
861 ranges_.insert(rr);
862 nrunes_ += rr.hi - rr.lo + 1;
863 }
864 }
865}
866
Russ Cox34d900b2010-05-10 16:03:39 -0700867void CharClassBuilder::Negate() {
Russ Cox0a38cba2010-03-02 17:17:51 -0800868 // Build up negation and then copy in.
869 // Could edit ranges in place, but C++ won't let me.
Paul Wankadiaee55a8f2016-08-02 21:49:57 +1000870 std::vector<RuneRange> v;
Russ Cox0a38cba2010-03-02 17:17:51 -0800871 v.reserve(ranges_.size() + 1);
872
873 // In negation, first range begins at 0, unless
874 // the current class begins at 0.
875 iterator it = begin();
876 if (it == end()) {
877 v.push_back(RuneRange(0, Runemax));
878 } else {
879 int nextlo = 0;
880 if (it->lo == 0) {
881 nextlo = it->hi + 1;
882 ++it;
883 }
884 for (; it != end(); ++it) {
885 v.push_back(RuneRange(nextlo, it->lo - 1));
886 nextlo = it->hi + 1;
887 }
888 if (nextlo <= Runemax)
889 v.push_back(RuneRange(nextlo, Runemax));
890 }
891
892 ranges_.clear();
Russ Cox11a0e6e2014-11-20 14:42:50 -0500893 for (size_t i = 0; i < v.size(); i++)
Russ Cox0a38cba2010-03-02 17:17:51 -0800894 ranges_.insert(v[i]);
895
896 upper_ = AlphaMask & ~upper_;
897 lower_ = AlphaMask & ~lower_;
898 nrunes_ = Runemax+1 - nrunes_;
899}
900
Russ Cox34d900b2010-05-10 16:03:39 -0700901// Character class is a sorted list of ranges.
902// The ranges are allocated in the same block as the header,
903// necessitating a special allocator and Delete method.
904
905CharClass* CharClass::New(int maxranges) {
906 CharClass* cc;
Paul Wankadiad8778252016-08-07 21:44:17 +1000907 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
Russ Cox34d900b2010-05-10 16:03:39 -0700908 cc = reinterpret_cast<CharClass*>(data);
909 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
910 cc->nranges_ = 0;
911 cc->folds_ascii_ = false;
912 cc->nrunes_ = 0;
913 return cc;
914}
915
916void CharClass::Delete() {
Paul Wankadiad8778252016-08-07 21:44:17 +1000917 uint8_t* data = reinterpret_cast<uint8_t*>(this);
Russ Cox34d900b2010-05-10 16:03:39 -0700918 delete[] data;
919}
920
921CharClass* CharClass::Negate() {
922 CharClass* cc = CharClass::New(nranges_+1);
923 cc->folds_ascii_ = folds_ascii_;
924 cc->nrunes_ = Runemax + 1 - nrunes_;
925 int n = 0;
926 int nextlo = 0;
927 for (CharClass::iterator it = begin(); it != end(); ++it) {
928 if (it->lo == nextlo) {
929 nextlo = it->hi + 1;
930 } else {
931 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
932 nextlo = it->hi + 1;
933 }
934 }
935 if (nextlo <= Runemax)
936 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
937 cc->nranges_ = n;
938 return cc;
939}
940
941bool CharClass::Contains(Rune r) {
942 RuneRange* rr = ranges_;
943 int n = nranges_;
944 while (n > 0) {
945 int m = n/2;
946 if (rr[m].hi < r) {
947 rr += m+1;
948 n -= m+1;
949 } else if (r < rr[m].lo) {
950 n = m;
951 } else { // rr[m].lo <= r && r <= rr[m].hi
952 return true;
953 }
954 }
955 return false;
956}
957
958CharClass* CharClassBuilder::GetCharClass() {
Peter Kastingdbb70002015-07-21 15:07:34 -0700959 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
960 int n = 0;
Russ Cox34d900b2010-05-10 16:03:39 -0700961 for (iterator it = begin(); it != end(); ++it)
962 cc->ranges_[n++] = *it;
963 cc->nranges_ = n;
Peter Kastingdbb70002015-07-21 15:07:34 -0700964 DCHECK_LE(n, static_cast<int>(ranges_.size()));
Russ Cox34d900b2010-05-10 16:03:39 -0700965 cc->nrunes_ = nrunes_;
966 cc->folds_ascii_ = FoldsASCII();
967 return cc;
968}
969
Russ Cox0a38cba2010-03-02 17:17:51 -0800970} // namespace re2