blob: 278c310136eae76b5a0c38cce98b2445b619d0c7 [file] [log] [blame]
Russ Cox0a38cba2010-03-02 17:17:51 -08001// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Format a regular expression structure as a string.
Russ Cox34d900b2010-05-10 16:03:39 -07006// Tested by parse_test.cc
Russ Cox0a38cba2010-03-02 17:17:51 -08007
Paul Wankadia00299462016-08-02 20:26:18 +10008#include <string.h>
9#include <string>
10
Russ Cox0a38cba2010-03-02 17:17:51 -080011#include "util/util.h"
Paul Wankadiacc382ec2016-08-17 01:00:16 +100012#include "util/logging.h"
Paul Wankadia95ce0cc2016-08-16 23:11:37 +100013#include "util/strutil.h"
Paul Wankadiacc382ec2016-08-17 01:00:16 +100014#include "util/utf.h"
Russ Cox0a38cba2010-03-02 17:17:51 -080015#include "re2/regexp.h"
16#include "re2/walker-inl.h"
17
18namespace re2 {
19
20enum {
21 PrecAtom,
22 PrecUnary,
23 PrecConcat,
24 PrecAlternate,
25 PrecEmpty,
26 PrecParen,
27 PrecToplevel,
28};
29
30// Helper function. See description below.
31static void AppendCCRange(string* t, Rune lo, Rune hi);
32
33// Walker to generate string in s_.
34// The arg pointers are actually integers giving the
35// context precedence.
36// The child_args are always NULL.
37class ToStringWalker : public Regexp::Walker<int> {
38 public:
39 explicit ToStringWalker(string* t) : t_(t) {}
40
41 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
42 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
43 int* child_args, int nchild_args);
44 virtual int ShortVisit(Regexp* re, int parent_arg) {
45 return 0;
46 }
47
48 private:
49 string* t_; // The string the walker appends to.
50
Paul Wankadiaf408be02016-08-16 23:42:11 +100051 ToStringWalker(const ToStringWalker&) = delete;
52 ToStringWalker& operator=(const ToStringWalker&) = delete;
Russ Cox0a38cba2010-03-02 17:17:51 -080053};
54
55string Regexp::ToString() {
56 string t;
57 ToStringWalker w(&t);
58 w.WalkExponential(this, PrecToplevel, 100000);
59 if (w.stopped_early())
60 t += " [truncated]";
61 return t;
62}
63
64#define ToString DontCallToString // Avoid accidental recursion.
65
66// Visits re before children are processed.
67// Appends ( if needed and passes new precedence to children.
68int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
69 int prec = parent_arg;
70 int nprec = PrecAtom;
71
72 switch (re->op()) {
73 case kRegexpNoMatch:
74 case kRegexpEmptyMatch:
75 case kRegexpLiteral:
76 case kRegexpAnyChar:
77 case kRegexpAnyByte:
78 case kRegexpBeginLine:
79 case kRegexpEndLine:
80 case kRegexpBeginText:
81 case kRegexpEndText:
82 case kRegexpWordBoundary:
83 case kRegexpNoWordBoundary:
84 case kRegexpCharClass:
Russ Cox4a9f4ca2010-07-15 20:38:05 -070085 case kRegexpHaveMatch:
Russ Cox0a38cba2010-03-02 17:17:51 -080086 nprec = PrecAtom;
87 break;
88
89 case kRegexpConcat:
90 case kRegexpLiteralString:
91 if (prec < PrecConcat)
92 t_->append("(?:");
93 nprec = PrecConcat;
94 break;
95
96 case kRegexpAlternate:
97 if (prec < PrecAlternate)
98 t_->append("(?:");
99 nprec = PrecAlternate;
100 break;
101
102 case kRegexpCapture:
103 t_->append("(");
Paul Wankadiafcdcf252015-08-01 13:00:00 +1000104 if (re->cap() == 0)
105 LOG(DFATAL) << "kRegexpCapture cap() == 0";
Russ Cox0a38cba2010-03-02 17:17:51 -0800106 if (re->name()) {
107 t_->append("?P<");
108 t_->append(*re->name());
109 t_->append(">");
110 }
111 nprec = PrecParen;
112 break;
113
114 case kRegexpStar:
115 case kRegexpPlus:
116 case kRegexpQuest:
117 case kRegexpRepeat:
118 if (prec < PrecUnary)
119 t_->append("(?:");
120 // The subprecedence here is PrecAtom instead of PrecUnary
121 // because PCRE treats two unary ops in a row as a parse error.
122 nprec = PrecAtom;
123 break;
124 }
125
126 return nprec;
127}
128
129static void AppendLiteral(string *t, Rune r, bool foldcase) {
130 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
131 t->append(1, '\\');
Peter Kasting0fde84a2014-12-09 16:33:28 -0800132 t->append(1, static_cast<char>(r));
Russ Cox0a38cba2010-03-02 17:17:51 -0800133 } else if (foldcase && 'a' <= r && r <= 'z') {
Paul Wankadia1bcbb5f2017-04-24 14:58:20 +1000134 r -= 'a' - 'A';
Russ Cox0a38cba2010-03-02 17:17:51 -0800135 t->append(1, '[');
Peter Kasting0fde84a2014-12-09 16:33:28 -0800136 t->append(1, static_cast<char>(r));
137 t->append(1, static_cast<char>(r) + 'a' - 'A');
Russ Cox0a38cba2010-03-02 17:17:51 -0800138 t->append(1, ']');
139 } else {
140 AppendCCRange(t, r, r);
141 }
142}
143
144// Visits re after children are processed.
145// For childless regexps, all the work is done here.
146// For regexps with children, append any unary suffixes or ).
147int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
148 int* child_args, int nchild_args) {
149 int prec = parent_arg;
150 switch (re->op()) {
151 case kRegexpNoMatch:
152 // There's no simple symbol for "no match", but
153 // [^0-Runemax] excludes everything.
154 t_->append("[^\\x00-\\x{10ffff}]");
155 break;
156
157 case kRegexpEmptyMatch:
158 // Append (?:) to make empty string visible,
159 // unless this is already being parenthesized.
160 if (prec < PrecEmpty)
161 t_->append("(?:)");
162 break;
163
164 case kRegexpLiteral:
Paul Wankadia196ee292015-12-06 17:06:22 +1100165 AppendLiteral(t_, re->rune(),
166 (re->parse_flags() & Regexp::FoldCase) != 0);
Russ Cox0a38cba2010-03-02 17:17:51 -0800167 break;
168
169 case kRegexpLiteralString:
170 for (int i = 0; i < re->nrunes(); i++)
Paul Wankadia196ee292015-12-06 17:06:22 +1100171 AppendLiteral(t_, re->runes()[i],
172 (re->parse_flags() & Regexp::FoldCase) != 0);
Russ Cox0a38cba2010-03-02 17:17:51 -0800173 if (prec < PrecConcat)
174 t_->append(")");
175 break;
176
177 case kRegexpConcat:
178 if (prec < PrecConcat)
179 t_->append(")");
180 break;
181
182 case kRegexpAlternate:
183 // Clumsy but workable: the children all appended |
184 // at the end of their strings, so just remove the last one.
185 if ((*t_)[t_->size()-1] == '|')
186 t_->erase(t_->size()-1);
187 else
188 LOG(DFATAL) << "Bad final char: " << t_;
189 if (prec < PrecAlternate)
190 t_->append(")");
191 break;
192
193 case kRegexpStar:
194 t_->append("*");
195 if (re->parse_flags() & Regexp::NonGreedy)
196 t_->append("?");
197 if (prec < PrecUnary)
198 t_->append(")");
199 break;
200
201 case kRegexpPlus:
202 t_->append("+");
203 if (re->parse_flags() & Regexp::NonGreedy)
204 t_->append("?");
205 if (prec < PrecUnary)
206 t_->append(")");
207 break;
208
209 case kRegexpQuest:
210 t_->append("?");
211 if (re->parse_flags() & Regexp::NonGreedy)
212 t_->append("?");
213 if (prec < PrecUnary)
214 t_->append(")");
215 break;
216
217 case kRegexpRepeat:
218 if (re->max() == -1)
219 t_->append(StringPrintf("{%d,}", re->min()));
220 else if (re->min() == re->max())
221 t_->append(StringPrintf("{%d}", re->min()));
222 else
223 t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
224 if (re->parse_flags() & Regexp::NonGreedy)
225 t_->append("?");
226 if (prec < PrecUnary)
227 t_->append(")");
228 break;
229
230 case kRegexpAnyChar:
231 t_->append(".");
232 break;
233
234 case kRegexpAnyByte:
235 t_->append("\\C");
236 break;
237
238 case kRegexpBeginLine:
239 t_->append("^");
240 break;
241
242 case kRegexpEndLine:
243 t_->append("$");
244 break;
245
246 case kRegexpBeginText:
247 t_->append("(?-m:^)");
248 break;
249
250 case kRegexpEndText:
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700251 if (re->parse_flags() & Regexp::WasDollar)
252 t_->append("(?-m:$)");
253 else
254 t_->append("\\z");
Russ Cox0a38cba2010-03-02 17:17:51 -0800255 break;
256
257 case kRegexpWordBoundary:
258 t_->append("\\b");
259 break;
260
261 case kRegexpNoWordBoundary:
262 t_->append("\\B");
263 break;
264
265 case kRegexpCharClass: {
266 if (re->cc()->size() == 0) {
Russ Cox0a38cba2010-03-02 17:17:51 -0800267 t_->append("[^\\x00-\\x{10ffff}]");
268 break;
269 }
270 t_->append("[");
271 // Heuristic: show class as negated if it contains the
272 // non-character 0xFFFE.
273 CharClass* cc = re->cc();
274 if (cc->Contains(0xFFFE)) {
Russ Cox34d900b2010-05-10 16:03:39 -0700275 cc = cc->Negate();
Russ Cox0a38cba2010-03-02 17:17:51 -0800276 t_->append("^");
277 }
278 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
279 AppendCCRange(t_, i->lo, i->hi);
280 if (cc != re->cc())
Russ Cox34d900b2010-05-10 16:03:39 -0700281 cc->Delete();
Russ Cox0a38cba2010-03-02 17:17:51 -0800282 t_->append("]");
283 break;
284 }
285
286 case kRegexpCapture:
287 t_->append(")");
288 break;
Russ Cox4a9f4ca2010-07-15 20:38:05 -0700289
290 case kRegexpHaveMatch:
291 // There's no syntax accepted by the parser to generate
292 // this node (it is generated by RE2::Set) so make something
293 // up that is readable but won't compile.
294 t_->append("(?HaveMatch:%d)", re->match_id());
295 break;
Russ Cox0a38cba2010-03-02 17:17:51 -0800296 }
297
298 // If the parent is an alternation, append the | for it.
299 if (prec == PrecAlternate)
300 t_->append("|");
301
302 return 0;
303}
304
305// Appends a rune for use in a character class to the string t.
306static void AppendCCChar(string* t, Rune r) {
307 if (0x20 <= r && r <= 0x7E) {
308 if (strchr("[]^-\\", r))
309 t->append("\\");
Peter Kasting0fde84a2014-12-09 16:33:28 -0800310 t->append(1, static_cast<char>(r));
Russ Cox0a38cba2010-03-02 17:17:51 -0800311 return;
312 }
313 switch (r) {
314 default:
315 break;
316
317 case '\r':
318 t->append("\\r");
319 return;
320
321 case '\t':
322 t->append("\\t");
323 return;
324
325 case '\n':
326 t->append("\\n");
327 return;
328
329 case '\f':
330 t->append("\\f");
331 return;
332 }
333
334 if (r < 0x100) {
335 StringAppendF(t, "\\x%02x", static_cast<int>(r));
336 return;
337 }
338 StringAppendF(t, "\\x{%x}", static_cast<int>(r));
339}
340
341static void AppendCCRange(string* t, Rune lo, Rune hi) {
342 if (lo > hi)
343 return;
344 AppendCCChar(t, lo);
345 if (lo < hi) {
346 t->append("-");
347 AppendCCChar(t, hi);
348 }
349}
350
351} // namespace re2