Blame - re2/compile.cc - platform/external/regex-re2

blob: 3f8e0cceb422115bc05e85addf47f938029347b6 [file] [log] [blame]

Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1	// Copyright 2007 The RE2 Authors. All Rights Reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Compile regular expression to Prog.
				6	//
				7	// Prog and Inst are defined in prog.h.
				8	// This file's external interface is just Regexp::CompileToProg.
				9	// The Compiler class defined in this file is private.
				10
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	11	#include <stdint.h>
Paul Wankadia	0029946	2016-08-02 20:26:18 +1000	[diff] [blame]	12	#include <string.h>
Paul Wankadia	a834dce	2017-02-17 16:21:05 +1100	[diff] [blame]	13	#include <unordered_map>
Paul Wankadia	0029946	2016-08-02 20:26:18 +1000	[diff] [blame]	14	#include <utility>
				15
Paul Wankadia	cc382ec	2016-08-17 01:00:16 +1000	[diff] [blame]	16	#include "util/logging.h"
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	17	#include "util/pod_array.h"
Paul Wankadia	cc382ec	2016-08-17 01:00:16 +1000	[diff] [blame]	18	#include "util/utf.h"
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	19	#include "re2/prog.h"
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	20	#include "re2/re2.h"
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	21	#include "re2/regexp.h"
				22	#include "re2/walker-inl.h"
				23
				24	namespace re2 {
				25
				26	// List of pointers to Inst* that need to be filled in (patched).
				27	// Because the Inst* haven't been filled in yet,
				28	// we can use the Inst* word to hold the list's "next" pointer.
				29	// It's kind of sleazy, but it works well in practice.
				30	// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	31	//
				32	// Because the out and out1 fields in Inst are no longer pointers,
				33	// we can't use pointers directly here either. Instead, p refers
				34	// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
				35	// p == 0 represents the NULL list. This is okay because instruction #0
				36	// is always the fail instruction, which never appears on a list.
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	37
				38	struct PatchList {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	39	uint32_t p;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	40
				41	// Returns patch list containing just p.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	42	static PatchList Mk(uint32_t p);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	43
				44	// Patches all the entries on l to have value v.
				45	// Caller must not ever use patch list again.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	46	static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	47
				48	// Deref returns the next pointer pointed at by p.
				49	static PatchList Deref(Prog::Inst *inst0, PatchList l);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	50
				51	// Appends two patch lists and returns result.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	52	static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	53	};
				54
Russ Cox	cc56ba0	2014-01-10 10:19:39 -0500	[diff] [blame]	55	static PatchList nullPatchList = { 0 };
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	56
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	57	// Returns patch list containing just p.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	58	PatchList PatchList::Mk(uint32_t p) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	59	PatchList l;
				60	l.p = p;
				61	return l;
				62	}
				63
				64	// Returns the next pointer pointed at by l.
				65	PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
				66	Prog::Inst* ip = &inst0[l.p>>1];
				67	if (l.p&1)
				68	l.p = ip->out1();
				69	else
				70	l.p = ip->out();
				71	return l;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	72	}
				73
				74	// Patches all the entries on l to have value v.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	75	void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	76	while (l.p != 0) {
				77	Prog::Inst* ip = &inst0[l.p>>1];
				78	if (l.p&1) {
				79	l.p = ip->out1();
				80	ip->out1_ = val;
				81	} else {
				82	l.p = ip->out();
				83	ip->set_out(val);
				84	}
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	85	}
				86	}
				87
				88	// Appends two patch lists and returns result.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	89	PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
				90	if (l1.p == 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	91	return l2;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	92	if (l2.p == 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	93	return l1;
				94
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	95	PatchList l = l1;
				96	for (;;) {
				97	PatchList next = PatchList::Deref(inst0, l);
				98	if (next.p == 0)
				99	break;
				100	l = next;
				101	}
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	102
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	103	Prog::Inst* ip = &inst0[l.p>>1];
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	104	if (l.p&1)
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	105	ip->out1_ = l2.p;
				106	else
				107	ip->set_out(l2.p);
				108
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	109	return l1;
				110	}
				111
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	112	// Compiled program fragment.
				113	struct Frag {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	114	uint32_t begin;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	115	PatchList end;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	116
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	117	Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	118	Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	119	};
				120
// Encodings the compiler can target for the input text.
enum Encoding {
  kEncodingUTF8 = 1,    // UTF-8 (0-10FFFF)
  kEncodingLatin1 = 2,  // Latin-1 (0-FF)
};
				126
				127	class Compiler : public Regexp::Walker<Frag> {
				128	public:
				129	explicit Compiler();
				130	~Compiler();
				131
				132	// Compiles Regexp to a new Prog.
				133	// Caller is responsible for deleting Prog when finished with it.
				134	// If reversed is true, compiles for walking over the input
				135	// string backward (reverses all concatenations).
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	136	static Prog Compile(Regexp re, bool reversed, int64_t max_mem);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	137
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	138	// Compiles alternation of all the re to a new Prog.
				139	// Each re has a match with an id equal to its index in the vector.
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	140	static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	141
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	142	// Interface for Regexp::Walker, which helps traverse the Regexp.
				143	// The walk is purely post-recursive: given the machines for the
				144	// children, PostVisit combines them to create the machine for
				145	// the current node. The child_args are Frags.
				146	// The Compiler traverses the Regexp parse tree, visiting
				147	// each node in depth-first order. It invokes PreVisit before
				148	// visiting the node's children and PostVisit after visiting
				149	// the children.
				150	Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
				151	Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
				152	int nchild_args);
				153	Frag ShortVisit(Regexp* re, Frag parent_arg);
				154	Frag Copy(Frag arg);
				155
				156	// Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
				157	Frag Plus(Frag a, bool nongreedy);
				158	Frag Star(Frag a, bool nongreedy);
				159	Frag Quest(Frag a, bool nongreedy);
				160
				161	// Given fragment a, returns (a) capturing as \n.
				162	Frag Capture(Frag a, int n);
				163
				164	// Given fragments a and b, returns ab; a\|b
				165	Frag Cat(Frag a, Frag b);
				166	Frag Alt(Frag a, Frag b);
				167
				168	// Returns a fragment that can't match anything.
				169	Frag NoMatch();
				170
				171	// Returns a fragment that matches the empty string.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	172	Frag Match(int32_t id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	173
				174	// Returns a no-op fragment.
				175	Frag Nop();
				176
				177	// Returns a fragment matching the byte range lo-hi.
				178	Frag ByteRange(int lo, int hi, bool foldcase);
				179
				180	// Returns a fragment matching an empty-width special op.
				181	Frag EmptyWidth(EmptyOp op);
				182
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	183	// Adds n instructions to the program.
				184	// Returns the index of the first one.
				185	// Returns -1 if no more instructions are available.
				186	int AllocInst(int n);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	187
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	188	// Rune range compiler.
				189
				190	// Begins a new alternation.
				191	void BeginRange();
				192
				193	// Adds a fragment matching the rune range lo-hi.
				194	void AddRuneRange(Rune lo, Rune hi, bool foldcase);
				195	void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
				196	void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
				197	void Add_80_10ffff();
				198
				199	// New suffix that matches the byte range lo-hi, then goes to next.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	200	int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
				201	int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	202
				203	// Returns true iff the suffix is cached.
				204	bool IsCachedRuneByteSuffix(int id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	205
				206	// Adds a suffix to alternation.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	207	void AddSuffix(int id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	208
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	209	// Adds a suffix to the trie starting from the given root node.
				210	// Returns zero iff allocating an instruction fails. Otherwise, returns
				211	// the current root node, which might be different from what was given.
				212	int AddSuffixRecursive(int root, int id);
				213
				214	// Finds the trie node for the given suffix. Returns a Frag in order to
				215	// distinguish between pointing at the root node directly (end.p == 0)
				216	// and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
				217	Frag FindByteRange(int root, int id);
				218
				219	// Compares two ByteRanges and returns true iff they are equal.
				220	bool ByteRangeEqual(int id1, int id2);
				221
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	222	// Returns the alternation of all the added suffixes.
				223	Frag EndRange();
				224
				225	// Single rune.
				226	Frag Literal(Rune r, bool foldcase);
				227
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	228	void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	229	Prog* Finish();
				230
				231	// Returns .* where dot = any byte
				232	Frag DotStar();
				233
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	234	private:
				235	Prog* prog_; // Program being built.
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	236	bool failed_; // Did we give up compiling?
				237	Encoding encoding_; // Input encoding
				238	bool reversed_; // Should program run backward over text?
				239
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	240	PODArray<Prog::Inst> inst_;
				241	int ninst_; // Number of instructions used.
				242	int max_ninst_; // Maximum number of instructions.
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	243
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	244	int64_t max_mem_; // Total memory budget.
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	245
Paul Wankadia	a834dce	2017-02-17 16:21:05 +1100	[diff] [blame]	246	std::unordered_map<uint64_t, int> rune_cache_;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	247	Frag rune_range_;
				248
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	249	RE2::Anchor anchor_; // anchor mode for RE2::Set
				250
Paul Wankadia	f408be0	2016-08-16 23:42:11 +1000	[diff] [blame]	251	Compiler(const Compiler&) = delete;
				252	Compiler& operator=(const Compiler&) = delete;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	253	};
				254
				255	Compiler::Compiler() {
				256	prog_ = new Prog();
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	257	failed_ = false;
				258	encoding_ = kEncodingUTF8;
				259	reversed_ = false;
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	260	ninst_ = 0;
				261	max_ninst_ = 1; // make AllocInst for fail instruction okay
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	262	max_mem_ = 0;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	263	int fail = AllocInst(1);
				264	inst_[fail].InitFail();
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	265	max_ninst_ = 0; // Caller must change
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	266	}
				267
				268	Compiler::~Compiler() {
Russ Cox	f5ccd19	2010-03-19 21:56:07 -0700	[diff] [blame]	269	delete prog_;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	270	}
				271
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	272	int Compiler::AllocInst(int n) {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	273	if (failed_ \|\| ninst_ + n > max_ninst_) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	274	failed_ = true;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	275	return -1;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	276	}
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	277
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	278	if (ninst_ + n > inst_.size()) {
				279	int cap = inst_.size();
				280	if (cap == 0)
				281	cap = 8;
				282	while (ninst_ + n > cap)
				283	cap *= 2;
				284	PODArray<Prog::Inst> inst(cap);
				285	if (inst_.data() != NULL)
				286	memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
				287	memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
				288	inst_ = std::move(inst);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	289	}
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	290	int id = ninst_;
				291	ninst_ += n;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	292	return id;
				293	}
				294
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	295	// These routines are somewhat hard to visualize in text --
				296	// see http://swtch.com/~rsc/regexp/regexp1.html for
				297	// pictures explaining what is going on here.
				298
				299	// Returns an unmatchable fragment.
				300	Frag Compiler::NoMatch() {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	301	return Frag(0, nullPatchList);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	302	}
				303
				304	// Is a an unmatchable fragment?
				305	static bool IsNoMatch(Frag a) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	306	return a.begin == 0;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	307	}
				308
				309	// Given fragments a and b, returns fragment for ab.
				310	Frag Compiler::Cat(Frag a, Frag b) {
				311	if (IsNoMatch(a) \|\| IsNoMatch(b))
				312	return NoMatch();
				313
				314	// Elide no-op.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	315	Prog::Inst* begin = &inst_[a.begin];
				316	if (begin->opcode() == kInstNop &&
				317	a.end.p == (a.begin << 1) &&
				318	begin->out() == 0) {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	319	// in case refs to a somewhere
				320	PatchList::Patch(inst_.data(), a.end, b.begin);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	321	return b;
				322	}
				323
				324	// To run backward over string, reverse all concatenations.
				325	if (reversed_) {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	326	PatchList::Patch(inst_.data(), b.end, a.begin);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	327	return Frag(b.begin, a.end);
				328	}
				329
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	330	PatchList::Patch(inst_.data(), a.end, b.begin);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	331	return Frag(a.begin, b.end);
				332	}
				333
				334	// Given fragments for a and b, returns fragment for a\|b.
				335	Frag Compiler::Alt(Frag a, Frag b) {
				336	// Special case for convenience in loops.
				337	if (IsNoMatch(a))
				338	return b;
				339	if (IsNoMatch(b))
				340	return a;
				341
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	342	int id = AllocInst(1);
				343	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	344	return NoMatch();
				345
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	346	inst_[id].InitAlt(a.begin, b.begin);
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	347	return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	348	}
				349
				350	// When capturing submatches in like-Perl mode, a kOpAlt Inst
				351	// treats out_ as the first choice, out1_ as the second.
				352	//
				353	// For *, +, and ?, if out_ causes another repetition,
				354	// then the operator is greedy. If out1_ is the repetition
				355	// (and out_ moves forward), then the operator is non-greedy.
				356
				357	// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
				358	Frag Compiler::Star(Frag a, bool nongreedy) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	359	int id = AllocInst(1);
				360	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	361	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	362	inst_[id].InitAlt(0, 0);
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	363	PatchList::Patch(inst_.data(), a.end, id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	364	if (nongreedy) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	365	inst_[id].out1_ = a.begin;
				366	return Frag(id, PatchList::Mk(id << 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	367	} else {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	368	inst_[id].set_out(a.begin);
				369	return Frag(id, PatchList::Mk((id << 1) \| 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	370	}
				371	}
				372
				373	// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
				374	Frag Compiler::Plus(Frag a, bool nongreedy) {
				375	// a+ is just a* with a different entry point.
				376	Frag f = Star(a, nongreedy);
				377	return Frag(a.begin, f.end);
				378	}
				379
				380	// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
				381	Frag Compiler::Quest(Frag a, bool nongreedy) {
Russ Cox	33d2a42	2014-01-09 22:31:48 -0500	[diff] [blame]	382	if (IsNoMatch(a))
				383	return Nop();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	384	int id = AllocInst(1);
				385	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	386	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	387	PatchList pl;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	388	if (nongreedy) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	389	inst_[id].InitAlt(0, a.begin);
				390	pl = PatchList::Mk(id << 1);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	391	} else {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	392	inst_[id].InitAlt(a.begin, 0);
				393	pl = PatchList::Mk((id << 1) \| 1);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	394	}
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	395	return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	396	}
				397
				398	// Returns a fragment for the byte range lo-hi.
				399	Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	400	int id = AllocInst(1);
				401	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	402	return NoMatch();
Russ Cox	2701a76	2011-01-02 18:52:26 -0500	[diff] [blame]	403	inst_[id].InitByteRange(lo, hi, foldcase, 0);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	404	return Frag(id, PatchList::Mk(id << 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	405	}
				406
				407	// Returns a no-op fragment. Sometimes unavoidable.
				408	Frag Compiler::Nop() {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	409	int id = AllocInst(1);
				410	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	411	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	412	inst_[id].InitNop(0);
				413	return Frag(id, PatchList::Mk(id << 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	414	}
				415
				416	// Returns a fragment that signals a match.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	417	Frag Compiler::Match(int32_t match_id) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	418	int id = AllocInst(1);
				419	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	420	return NoMatch();
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	421	inst_[id].InitMatch(match_id);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	422	return Frag(id, nullPatchList);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	423	}
				424
				425	// Returns a fragment matching a particular empty-width op (like ^ or $)
				426	Frag Compiler::EmptyWidth(EmptyOp empty) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	427	int id = AllocInst(1);
				428	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	429	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	430	inst_[id].InitEmptyWidth(empty, 0);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	431	return Frag(id, PatchList::Mk(id << 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	432	}
				433
				434	// Given a fragment a, returns a fragment with capturing parens around a.
				435	Frag Compiler::Capture(Frag a, int n) {
Russ Cox	33d2a42	2014-01-09 22:31:48 -0500	[diff] [blame]	436	if (IsNoMatch(a))
				437	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	438	int id = AllocInst(2);
				439	if (id < 0)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	440	return NoMatch();
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	441	inst_[id].InitCapture(2*n, a.begin);
				442	inst_[id+1].InitCapture(2*n+1, 0);
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	443	PatchList::Patch(inst_.data(), a.end, id+1);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	444
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	445	return Frag(id, PatchList::Mk((id+1) << 1));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	446	}
				447
				// A Rune is a name for a Unicode code point.
				// Returns the largest rune that a UTF-8 sequence of len bytes can encode.
				static int MaxRune(int len) {
				  // Payload bits: 7 for a single byte; otherwise the lead byte carries
				  // 8-(len+1) bits and each of the len-1 continuation bytes carries 6.
				  const int bits = (len == 1) ? 7 : 8-(len+1) + 6*(len-1);
				  return (1 << bits) - 1;
				}
				458
				459	// The rune range compiler caches common suffix fragments,
				460	// which are very common in UTF-8 (e.g., [80-bf]).
				461	// The fragment suffixes are identified by their start
				462	// instructions. NULL denotes the eventual end match.
				463	// The Frag accumulates in rune_range_. Caching common
				464	// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
				465	// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
				466
				467	void Compiler::BeginRange() {
				468	rune_cache_.clear();
Russ Cox	2701a76	2011-01-02 18:52:26 -0500	[diff] [blame]	469	rune_range_.begin = 0;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	470	rune_range_.end = nullPatchList;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	471	}
				472
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	473	int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	474	int next) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	475	Frag f = ByteRange(lo, hi, foldcase);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	476	if (next != 0) {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	477	PatchList::Patch(inst_.data(), f.end, next);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	478	} else {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	479	rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	480	}
				481	return f.begin;
				482	}
				483
// Packs the identity of a byte-range suffix into a single 64-bit cache key:
// bit 0 holds foldcase, bits 1-8 hold hi, bits 9-16 hold lo, and next
// occupies bit 17 and above.
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  const uint64_t next_bits = static_cast<uint64_t>(next) << 17;
  const uint64_t lo_bits = static_cast<uint64_t>(lo) << 9;
  const uint64_t hi_bits = static_cast<uint64_t>(hi) << 1;
  const uint64_t fold_bit = static_cast<uint64_t>(foldcase);
  return next_bits | lo_bits | hi_bits | fold_bit;
}
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	491
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	492	int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	493	int next) {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	494	uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
Paul Wankadia	a834dce	2017-02-17 16:21:05 +1100	[diff] [blame]	495	std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	496	if (it != rune_cache_.end())
				497	return it->second;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	498	int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
				499	rune_cache_[key] = id;
				500	return id;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	501	}
				502
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	503	bool Compiler::IsCachedRuneByteSuffix(int id) {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	504	uint8_t lo = inst_[id].lo_;
				505	uint8_t hi = inst_[id].hi_;
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	506	bool foldcase = inst_[id].foldcase() != 0;
				507	int next = inst_[id].out();
				508
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	509	uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	510	return rune_cache_.find(key) != rune_cache_.end();
				511	}
				512
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	513	void Compiler::AddSuffix(int id) {
Paul Wankadia	526c396	2016-03-22 15:34:37 +1100	[diff] [blame]	514	if (failed_)
				515	return;
				516
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	517	if (rune_range_.begin == 0) {
				518	rune_range_.begin = id;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	519	return;
				520	}
				521
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	522	if (encoding_ == kEncodingUTF8) {
				523	// Build a trie in order to reduce fanout.
				524	rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
				525	return;
				526	}
				527
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	528	int alt = AllocInst(1);
				529	if (alt < 0) {
				530	rune_range_.begin = 0;
				531	return;
				532	}
				533	inst_[alt].InitAlt(rune_range_.begin, id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	534	rune_range_.begin = alt;
				535	}
				536
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	537	int Compiler::AddSuffixRecursive(int root, int id) {
				538	DCHECK(inst_[root].opcode() == kInstAlt \|\|
				539	inst_[root].opcode() == kInstByteRange);
				540
				541	Frag f = FindByteRange(root, id);
				542	if (IsNoMatch(f)) {
				543	int alt = AllocInst(1);
				544	if (alt < 0)
				545	return 0;
				546	inst_[alt].InitAlt(root, id);
				547	return alt;
				548	}
				549
				550	int br;
				551	if (f.end.p == 0)
				552	br = root;
				553	else if (f.end.p&1)
				554	br = inst_[f.begin].out1();
				555	else
				556	br = inst_[f.begin].out();
				557
				558	if (IsCachedRuneByteSuffix(br)) {
				559	// We can't fiddle with cached suffixes, so make a clone of the head.
				560	int byterange = AllocInst(1);
				561	if (byterange < 0)
				562	return 0;
				563	inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
				564	inst_[br].foldcase(), inst_[br].out());
				565
				566	// Ensure that the parent points to the clone, not to the original.
				567	// Note that this could leave the head unreachable except via the cache.
				568	br = byterange;
				569	if (f.end.p == 0)
				570	root = br;
				571	else if (f.end.p&1)
				572	inst_[f.begin].out1_ = br;
				573	else
				574	inst_[f.begin].set_out(br);
				575	}
				576
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	577	int out = inst_[id].out();
				578	if (!IsCachedRuneByteSuffix(id)) {
				579	// The head should be the instruction most recently allocated, so free it
				580	// instead of leaving it unreachable.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	581	DCHECK_EQ(id, ninst_-1);
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	582	inst_[id].out_opcode_ = 0;
				583	inst_[id].out1_ = 0;
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	584	ninst_--;
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	585	}
				586
				587	out = AddSuffixRecursive(inst_[br].out(), out);
				588	if (out == 0)
				589	return 0;
				590
				591	inst_[br].set_out(out);
				592	return root;
				593	}
				594
				595	bool Compiler::ByteRangeEqual(int id1, int id2) {
				596	return inst_[id1].lo() == inst_[id2].lo() &&
				597	inst_[id1].hi() == inst_[id2].hi() &&
				598	inst_[id1].foldcase() == inst_[id2].foldcase();
				599	}
				600
				601	Frag Compiler::FindByteRange(int root, int id) {
				602	if (inst_[root].opcode() == kInstByteRange) {
				603	if (ByteRangeEqual(root, id))
				604	return Frag(root, nullPatchList);
				605	else
				606	return NoMatch();
				607	}
				608
				609	while (inst_[root].opcode() == kInstAlt) {
				610	int out1 = inst_[root].out1();
				611	if (ByteRangeEqual(out1, id))
				612	return Frag(root, PatchList::Mk((root << 1) \| 1));
				613
				614	// CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
				615	// what we're looking for, then we can stop immediately. Unfortunately, we
				616	// can't short-circuit the search in reverse mode.
				617	if (!reversed_)
				618	return NoMatch();
				619
				620	int out = inst_[root].out();
				621	if (inst_[out].opcode() == kInstAlt)
				622	root = out;
				623	else if (ByteRangeEqual(out, id))
				624	return Frag(root, PatchList::Mk(root << 1));
				625	else
				626	return NoMatch();
				627	}
				628
				629	LOG(DFATAL) << "should never happen";
				630	return NoMatch();
				631	}
				632
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	633	Frag Compiler::EndRange() {
				634	return rune_range_;
				635	}
				636
				637	// Converts rune range lo-hi into a fragment that recognizes
				638	// the bytes that would make up those runes in the current
				639	// encoding (Latin 1 or UTF-8).
				640	// This lets the machine work byte-by-byte even when
				641	// using multibyte encodings.
				642
				643	void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
				644	switch (encoding_) {
				645	default:
				646	case kEncodingUTF8:
				647	AddRuneRangeUTF8(lo, hi, foldcase);
				648	break;
				649	case kEncodingLatin1:
				650	AddRuneRangeLatin1(lo, hi, foldcase);
				651	break;
				652	}
				653	}
				654
				655	void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	656	// Latin-1 is easy: runes are bytes.
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	657	if (lo > hi \|\| lo > 0xFF)
				658	return;
				659	if (hi > 0xFF)
				660	hi = 0xFF;
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	661	AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
				662	static_cast<uint8_t>(hi), foldcase, 0));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	663	}
				664
				665	// Table describing how to make a UTF-8 matching machine
				666	// for the rune range 80-10FFFF (Runeself-Runemax).
				667	// This range happens frequently enough (for example /./ and /[^a-z]/)
				668	// and the rune_cache_ map is slow enough that this is worth
				669	// special handling. Makes compilation of a small expression
				670	// with a dot in it about 10% faster.
				671	// The * in the comments below mark whole sequences.
				672	static struct ByteRangeProg {
				673	int next;
				674	int lo;
				675	int hi;
				676	} prog_80_10ffff[] = {
				677	// Two-byte
				678	{ -1, 0x80, 0xBF, }, // 0: 80-BF
				679	{ 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
				680
				681	// Three-byte
				682	{ 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
				683	{ 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
				684	{ 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
				685	{ 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
				686
				687	// Four-byte
				688	{ 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
				689	{ 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
				690	{ 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
				691	{ 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
				692	{ 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
				693	{ 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
				694	};
				695
				696	void Compiler::Add_80_10ffff() {
Russ Cox	1deddeb	2012-08-21 07:35:28 -0700	[diff] [blame]	697	int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	698	for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
				699	const ByteRangeProg& p = prog_80_10ffff[i];
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	700	int next = 0;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	701	if (p.next >= 0)
				702	next = inst[p.next];
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	703	inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
				704	static_cast<uint8_t>(p.hi), false, next);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	705	if ((p.lo & 0xC0) != 0x80)
				706	AddSuffix(inst[i]);
				707	}
				708	}
				709
				710	void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
				711	if (lo > hi)
				712	return;
				713
				714	// Pick off 80-10FFFF as a common special case
				715	// that can bypass the slow rune_cache_.
				716	if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
				717	Add_80_10ffff();
				718	return;
				719	}
				720
				721	// Split range into same-length sized ranges.
				722	for (int i = 1; i < UTFmax; i++) {
				723	Rune max = MaxRune(i);
				724	if (lo <= max && max < hi) {
				725	AddRuneRangeUTF8(lo, max, foldcase);
				726	AddRuneRangeUTF8(max+1, hi, foldcase);
				727	return;
				728	}
				729	}
				730
				731	// ASCII range is always a special case.
				732	if (hi < Runeself) {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	733	AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
				734	static_cast<uint8_t>(hi), foldcase, 0));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	735	return;
				736	}
				737
				738	// Split range into sections that agree on leading bytes.
				739	for (int i = 1; i < UTFmax; i++) {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	740	uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	741	if ((lo & ~m) != (hi & ~m)) {
				742	if ((lo & m) != 0) {
				743	AddRuneRangeUTF8(lo, lo\|m, foldcase);
				744	AddRuneRangeUTF8((lo\|m)+1, hi, foldcase);
				745	return;
				746	}
				747	if ((hi & m) != m) {
				748	AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
				749	AddRuneRangeUTF8(hi&~m, hi, foldcase);
				750	return;
				751	}
				752	}
				753	}
				754
				755	// Finally. Generate byte matching equivalent for lo-hi.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	756	uint8_t ulo[UTFmax], uhi[UTFmax];
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	757	int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
				758	int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	759	(void)m; // USED(m)
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	760	DCHECK_EQ(n, m);
				761
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	762	// The logic below encodes this thinking:
				763	//
				764	// 1. When we have built the whole suffix, we know that it cannot
				765	// possibly be a suffix of anything longer: in forward mode, nothing
				766	// else can occur before the leading byte; in reverse mode, nothing
				767	// else can occur after the last continuation byte or else the leading
				768	// byte would have to change. Thus, there is no benefit to caching
				769	// the first byte of the suffix whereas there is a cost involved in
				770	// cloning it if it begins a common prefix, which is fairly likely.
				771	//
				772	// 2. Conversely, the last byte of the suffix cannot possibly be a
				773	// prefix of anything because next == 0, so we will never want to
				774	// clone it, but it is fairly likely to be a common suffix. Perhaps
				775	// more so in reverse mode than in forward mode because the former is
				776	// "converging" towards lower entropy, but caching is still worthwhile
				777	// for the latter in cases such as 80-BF.
				778	//
				779	// 3. Handling the bytes between the first and the last is less
				780	// straightforward and, again, the approach depends on whether we are
				781	// "converging" towards lower entropy: in forward mode, a single byte
				782	// is unlikely to be part of a common suffix whereas a byte range
				783	// is more likely so; in reverse mode, a byte range is unlikely to
				784	// be part of a common suffix whereas a single byte is more likely
				785	// so. The same benefit versus cost argument applies here.
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	786	int id = 0;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	787	if (reversed_) {
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	788	for (int i = 0; i < n; i++) {
				789	// In reverse UTF-8 mode: cache the leading byte; don't cache the last
				790	// continuation byte; cache anything else iff it's a single byte (XX-XX).
				791	if (i == 0 \|\| (ulo[i] == uhi[i] && i != n-1))
				792	id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
				793	else
				794	id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
				795	}
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	796	} else {
Paul Wankadia	7446ad3	2016-01-25 19:39:53 +1100	[diff] [blame]	797	for (int i = n-1; i >= 0; i--) {
				798	// In forward UTF-8 mode: don't cache the leading byte; cache the last
				799	// continuation byte; cache anything else iff it's a byte range (XX-YY).
				800	if (i == n-1 \|\| (ulo[i] < uhi[i] && i != 0))
				801	id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
				802	else
				803	id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
				804	}
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	805	}
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	806	AddSuffix(id);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	807	}
				808
				809	// Should not be called.
				810	Frag Compiler::Copy(Frag arg) {
				811	// We're using WalkExponential; there should be no copying.
				812	LOG(DFATAL) << "Compiler::Copy called!";
				813	failed_ = true;
				814	return NoMatch();
				815	}
				816
				817	// Visits a node quickly; called once WalkExponential has
				818	// decided to cut this walk short.
				819	Frag Compiler::ShortVisit(Regexp* re, Frag) {
				820	failed_ = true;
				821	return NoMatch();
				822	}
				823
				824	// Called before traversing a node's children during the walk.
				825	Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
				826	// Cut off walk if we've already failed.
				827	if (failed_)
				828	*stop = true;
				829
Russ Cox	cc56ba0	2014-01-10 10:19:39 -0500	[diff] [blame]	830	return Frag(); // not used by caller
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	831	}
				832
				833	Frag Compiler::Literal(Rune r, bool foldcase) {
				834	switch (encoding_) {
				835	default:
Russ Cox	cc56ba0	2014-01-10 10:19:39 -0500	[diff] [blame]	836	return Frag();
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	837
				838	case kEncodingLatin1:
				839	return ByteRange(r, r, foldcase);
				840
				841	case kEncodingUTF8: {
				842	if (r < Runeself) // Make common case fast.
				843	return ByteRange(r, r, foldcase);
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	844	uint8_t buf[UTFmax];
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	845	int n = runetochar(reinterpret_cast<char*>(buf), &r);
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	846	Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	847	for (int i = 1; i < n; i++)
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	848	f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	849	return f;
				850	}
				851	}
				852	}
				853
				854	// Called after traversing the node's children during the walk.
				855	// Given their frags, build and return the frag for this re.
				856	Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
				857	int nchild_frags) {
				858	// If a child failed, don't bother going forward, especially
				859	// since the child_frags might contain Frags with NULLs in them.
				860	if (failed_)
				861	return NoMatch();
				862
				863	// Given the child fragments, return the fragment for this node.
				864	switch (re->op()) {
				865	case kRegexpRepeat:
				866	// Should not see; code at bottom of function will print error
				867	break;
				868
				869	case kRegexpNoMatch:
				870	return NoMatch();
				871
				872	case kRegexpEmptyMatch:
				873	return Nop();
				874
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	875	case kRegexpHaveMatch: {
				876	Frag f = Match(re->match_id());
Paul Wankadia	f7ad022	2017-10-04 01:47:45 +1100	[diff] [blame]	877	if (anchor_ == RE2::ANCHOR_BOTH) {
				878	// Append \z or else the subexpression will effectively be unanchored.
				879	// Complemented by the UNANCHORED case in CompileSet().
				880	f = Cat(EmptyWidth(kEmptyEndText), f);
				881	}
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	882	return f;
				883	}
				884
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	885	case kRegexpConcat: {
				886	Frag f = child_frags[0];
				887	for (int i = 1; i < nchild_frags; i++)
				888	f = Cat(f, child_frags[i]);
				889	return f;
				890	}
				891
				892	case kRegexpAlternate: {
				893	Frag f = child_frags[0];
				894	for (int i = 1; i < nchild_frags; i++)
				895	f = Alt(f, child_frags[i]);
				896	return f;
				897	}
				898
				899	case kRegexpStar:
Paul Wankadia	196ee29	2015-12-06 17:06:22 +1100	[diff] [blame]	900	return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	901
				902	case kRegexpPlus:
Paul Wankadia	196ee29	2015-12-06 17:06:22 +1100	[diff] [blame]	903	return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	904
				905	case kRegexpQuest:
Paul Wankadia	196ee29	2015-12-06 17:06:22 +1100	[diff] [blame]	906	return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	907
				908	case kRegexpLiteral:
Paul Wankadia	196ee29	2015-12-06 17:06:22 +1100	[diff] [blame]	909	return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	910
				911	case kRegexpLiteralString: {
				912	// Concatenation of literals.
				913	if (re->nrunes() == 0)
				914	return Nop();
				915	Frag f;
				916	for (int i = 0; i < re->nrunes(); i++) {
Paul Wankadia	196ee29	2015-12-06 17:06:22 +1100	[diff] [blame]	917	Frag f1 = Literal(re->runes()[i],
				918	(re->parse_flags()&Regexp::FoldCase) != 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	919	if (i == 0)
				920	f = f1;
				921	else
				922	f = Cat(f, f1);
				923	}
				924	return f;
				925	}
				926
				927	case kRegexpAnyChar:
				928	BeginRange();
				929	AddRuneRange(0, Runemax, false);
				930	return EndRange();
				931
				932	case kRegexpAnyByte:
				933	return ByteRange(0x00, 0xFF, false);
				934
				935	case kRegexpCharClass: {
				936	CharClass* cc = re->cc();
				937	if (cc->empty()) {
				938	// This can't happen.
				939	LOG(DFATAL) << "No ranges in char class";
				940	failed_ = true;
				941	return NoMatch();
				942	}
				943
				944	// ASCII case-folding optimization: if the char class
				945	// behaves the same on A-Z as it does on a-z,
				946	// discard any ranges wholly contained in A-Z
				947	// and mark the other ranges as foldascii.
				948	// This reduces the size of a program for
				949	// (?i)abc from 3 insts per letter to 1 per letter.
				950	bool foldascii = cc->FoldsASCII();
				951
				952	// Character class is just a big OR of the different
				953	// character ranges in the class.
				954	BeginRange();
				955	for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
				956	// ASCII case-folding optimization (see above).
				957	if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
				958	continue;
				959
				960	// If this range contains all of A-Za-z or none of it,
				961	// the fold flag is unnecessary; don't bother.
				962	bool fold = foldascii;
Paul Wankadia	1f8bfb6	2015-05-11 17:48:40 +1000	[diff] [blame]	963	if ((i->lo <= 'A' && 'z' <= i->hi) \|\| i->hi < 'A' \|\| 'z' < i->lo \|\|
				964	('Z' < i->lo && i->hi < 'a'))
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	965	fold = false;
				966
				967	AddRuneRange(i->lo, i->hi, fold);
				968	}
				969	return EndRange();
				970	}
				971
				972	case kRegexpCapture:
				973	// If this is a non-capturing parenthesis -- (?:foo) --
				974	// just use the inner expression.
				975	if (re->cap() < 0)
				976	return child_frags[0];
				977	return Capture(child_frags[0], re->cap());
				978
				979	case kRegexpBeginLine:
				980	return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
				981
				982	case kRegexpEndLine:
				983	return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
				984
				985	case kRegexpBeginText:
				986	return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
				987
				988	case kRegexpEndText:
				989	return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
				990
				991	case kRegexpWordBoundary:
				992	return EmptyWidth(kEmptyWordBoundary);
				993
				994	case kRegexpNoWordBoundary:
				995	return EmptyWidth(kEmptyNonWordBoundary);
				996	}
				997	LOG(DFATAL) << "Missing case in Compiler: " << re->op();
				998	failed_ = true;
				999	return NoMatch();
				1000	}
				1001
				1002	// Is this regexp required to start at the beginning of the text?
				1003	// Only approximate; can return false for complicated regexps like (\Aa\|\Ab),
				1004	// but handles (\A(a\|b)). Could use the Walker to write a more exact one.
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1005	static bool IsAnchorStart(Regexp** pre, int depth) {
				1006	Regexp* re = *pre;
				1007	Regexp* sub;
				1008	// The depth limit makes sure that we don't overflow
				1009	// the stack on a deeply nested regexp. As the comment
				1010	// above says, IsAnchorStart is conservative, so returning
				1011	// a false negative is okay. The exact limit is somewhat arbitrary.
				1012	if (re == NULL \|\| depth >= 4)
				1013	return false;
				1014	switch (re->op()) {
				1015	default:
				1016	break;
				1017	case kRegexpConcat:
				1018	if (re->nsub() > 0) {
				1019	sub = re->sub()[0]->Incref();
				1020	if (IsAnchorStart(&sub, depth+1)) {
Chris Kennelly	f79b61f	2018-12-21 15:37:42 -0500	[diff] [blame]	1021	PODArray<Regexp*> subcopy(re->nsub());
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1022	subcopy[0] = sub; // already have reference
				1023	for (int i = 1; i < re->nsub(); i++)
				1024	subcopy[i] = re->sub()[i]->Incref();
Chris Kennelly	f79b61f	2018-12-21 15:37:42 -0500	[diff] [blame]	1025	*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1026	re->Decref();
				1027	return true;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1028	}
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1029	sub->Decref();
				1030	}
				1031	break;
				1032	case kRegexpCapture:
				1033	sub = re->sub()[0]->Incref();
				1034	if (IsAnchorStart(&sub, depth+1)) {
				1035	*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1036	re->Decref();
				1037	return true;
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1038	}
				1039	sub->Decref();
				1040	break;
				1041	case kRegexpBeginText:
				1042	*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
				1043	re->Decref();
				1044	return true;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1045	}
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1046	return false;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1047	}
				1048
				1049	// Is this regexp required to start at the end of the text?
				1050	// Only approximate; can return false for complicated regexps like (a\z\|b\z),
				1051	// but handles ((a\|b)\z). Could use the Walker to write a more exact one.
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1052	static bool IsAnchorEnd(Regexp** pre, int depth) {
				1053	Regexp* re = *pre;
				1054	Regexp* sub;
				1055	// The depth limit makes sure that we don't overflow
				1056	// the stack on a deeply nested regexp. As the comment
				1057	// above says, IsAnchorEnd is conservative, so returning
				1058	// a false negative is okay. The exact limit is somewhat arbitrary.
				1059	if (re == NULL \|\| depth >= 4)
				1060	return false;
				1061	switch (re->op()) {
				1062	default:
				1063	break;
				1064	case kRegexpConcat:
				1065	if (re->nsub() > 0) {
				1066	sub = re->sub()[re->nsub() - 1]->Incref();
				1067	if (IsAnchorEnd(&sub, depth+1)) {
Chris Kennelly	f79b61f	2018-12-21 15:37:42 -0500	[diff] [blame]	1068	PODArray<Regexp*> subcopy(re->nsub());
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1069	subcopy[re->nsub() - 1] = sub; // already have reference
				1070	for (int i = 0; i < re->nsub() - 1; i++)
				1071	subcopy[i] = re->sub()[i]->Incref();
Chris Kennelly	f79b61f	2018-12-21 15:37:42 -0500	[diff] [blame]	1072	*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1073	re->Decref();
				1074	return true;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1075	}
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1076	sub->Decref();
				1077	}
				1078	break;
				1079	case kRegexpCapture:
				1080	sub = re->sub()[0]->Incref();
				1081	if (IsAnchorEnd(&sub, depth+1)) {
				1082	*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1083	re->Decref();
				1084	return true;
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1085	}
				1086	sub->Decref();
				1087	break;
				1088	case kRegexpEndText:
				1089	*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
				1090	re->Decref();
				1091	return true;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1092	}
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1093	return false;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1094	}
				1095
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	1096	void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	1097	RE2::Anchor anchor) {
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1098	prog_->set_flags(flags);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1099
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1100	if (flags & Regexp::Latin1)
				1101	encoding_ = kEncodingLatin1;
				1102	max_mem_ = max_mem;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1103	if (max_mem <= 0) {
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1104	max_ninst_ = 100000; // more than enough
Paul Wankadia	070cf98	2016-08-22 20:32:08 +1000	[diff] [blame]	1105	} else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1106	// No room for anything.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1107	max_ninst_ = 0;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1108	} else {
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	1109	int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1110	// Limit instruction count so that inst->id() fits nicely in an int.
				1111	// SparseArray also assumes that the indices (inst->id()) are ints.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1112	// The call to WalkExponential uses 2*max_ninst_ below,
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1113	// and other places in the code use 2 or 3 * prog->size().
				1114	// Limiting to 2^24 should avoid overflow in those places.
				1115	// (The point of allowing more than 32 bits of memory is to
				1116	// have plenty of room for the DFA states, not to use it up
				1117	// on the program.)
				1118	if (m >= 1<<24)
				1119	m = 1<<24;
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1120
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	1121	// Inst imposes its own limit (currently bigger than 2^24 but be safe).
				1122	if (m > Prog::Inst::kMaxInst)
				1123	m = Prog::Inst::kMaxInst;
				1124
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1125	max_ninst_ = static_cast<int>(m);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1126	}
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	1127
				1128	anchor_ = anchor;
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1129	}
				1130
				1131	// Compiles re, returning program.
				1132	// Caller is responsible for deleting prog_.
				1133	// If reversed is true, compiles a program that expects
				1134	// to run over the input string backward (reverses all concatenations).
				1135	// The reversed flag is also recorded in the returned program.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	1136	Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1137	Compiler c;
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1138	c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1139	c.reversed_ = reversed;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1140
				1141	// Simplify to remove things like counted repetitions
				1142	// and character classes like \d.
				1143	Regexp* sre = re->Simplify();
				1144	if (sre == NULL)
				1145	return NULL;
				1146
				1147	// Record whether prog is anchored, removing the anchors.
				1148	// (They get in the way of other optimizations.)
Russ Cox	00ca3ec	2011-06-21 21:45:57 -0400	[diff] [blame]	1149	bool is_anchor_start = IsAnchorStart(&sre, 0);
				1150	bool is_anchor_end = IsAnchorEnd(&sre, 0);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1151
				1152	// Generate fragment for entire regexp.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1153	Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1154	sre->Decref();
				1155	if (c.failed_)
				1156	return NULL;
				1157
				1158	// Success! Finish by putting Match node at end, and record start.
				1159	// Turn off c.reversed_ (if it is set) to force the remaining concatenations
				1160	// to behave normally.
				1161	c.reversed_ = false;
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1162	all = c.Cat(all, c.Match(0));
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1163
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1164	c.prog_->set_reversed(reversed);
				1165	if (c.prog_->reversed()) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1166	c.prog_->set_anchor_start(is_anchor_end);
				1167	c.prog_->set_anchor_end(is_anchor_start);
				1168	} else {
				1169	c.prog_->set_anchor_start(is_anchor_start);
				1170	c.prog_->set_anchor_end(is_anchor_end);
				1171	}
				1172
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1173	c.prog_->set_start(all.begin);
				1174	if (!c.prog_->anchor_start()) {
				1175	// Also create unanchored version, which starts with a .*? loop.
				1176	all = c.Cat(c.DotStar(), all);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1177	}
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1178	c.prog_->set_start_unanchored(all.begin);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1179
				1180	// Hand ownership of prog_ to caller.
				1181	return c.Finish();
				1182	}
				1183
				1184	Prog* Compiler::Finish() {
				1185	if (failed_)
				1186	return NULL;
				1187
				1188	if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	1189	// No possible matches; keep Fail instruction only.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1190	ninst_ = 1;
Russ Cox	f07ea7a	2010-05-10 16:35:52 -0700	[diff] [blame]	1191	}
				1192
Paul Wankadia	33114d1	2016-05-13 15:21:31 +1000	[diff] [blame]	1193	// Hand off the array to Prog.
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1194	prog_->inst_ = std::move(inst_);
				1195	prog_->size_ = ninst_;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1196
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1197	prog_->Optimize();
Paul Wankadia	8eb6235	2016-03-16 20:41:01 +1100	[diff] [blame]	1198	prog_->Flatten();
Paul Wankadia	565fcac	2016-05-13 16:26:52 +1000	[diff] [blame]	1199	prog_->ComputeByteMap();
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1200
				1201	// Record remaining memory for DFA.
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1202	if (max_mem_ <= 0) {
				1203	prog_->set_dfa_mem(1<<20);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1204	} else {
Paul Wankadia	7906344	2017-02-13 17:49:32 +1100	[diff] [blame]	1205	int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1206	if (m < 0)
				1207	m = 0;
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1208	prog_->set_dfa_mem(m);
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1209	}
				1210
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1211	Prog* p = prog_;
				1212	prog_ = NULL;
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1213	return p;
				1214	}
				1215
				1216	// Converts Regexp to Prog.
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	1217	Prog* Regexp::CompileToProg(int64_t max_mem) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1218	return Compiler::Compile(this, false, max_mem);
				1219	}
				1220
Paul Wankadia	d877825	2016-08-07 21:44:17 +1000	[diff] [blame]	1221	Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1222	return Compiler::Compile(this, true, max_mem);
				1223	}
				1224
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1225	Frag Compiler::DotStar() {
				1226	return Star(ByteRange(0x00, 0xff, false), true);
				1227	}
				1228
				1229	// Compiles RE set to Prog.
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1230	Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1231	Compiler c;
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1232	c.Setup(re->parse_flags(), max_mem, anchor);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1233
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1234	Regexp* sre = re->Simplify();
				1235	if (sre == NULL)
				1236	return NULL;
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1237
Chris Kennelly	04452af	2018-12-18 16:04:50 -0500	[diff] [blame]	1238	Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1239	sre->Decref();
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	1240	if (c.failed_)
				1241	return NULL;
				1242
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1243	c.prog_->set_anchor_start(true);
				1244	c.prog_->set_anchor_end(true);
				1245
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	1246	if (anchor == RE2::UNANCHORED) {
Paul Wankadia	f7ad022	2017-10-04 01:47:45 +1100	[diff] [blame]	1247	// Prepend .* or else the expression will effectively be anchored.
				1248	// Complemented by the ANCHOR_BOTH case in PostVisit().
Russ Cox	4a9f4ca	2010-07-15 20:38:05 -0700	[diff] [blame]	1249	all = c.Cat(c.DotStar(), all);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1250	}
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1251	c.prog_->set_start(all.begin);
				1252	c.prog_->set_start_unanchored(all.begin);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1253
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1254	Prog* prog = c.Finish();
				1255	if (prog == NULL)
				1256	return NULL;
				1257
				1258	// Make sure DFA has enough memory to operate,
				1259	// since we're not going to fall back to the NFA.
Paul Wankadia	6c7b9af	2016-09-06 18:05:58 +1000	[diff] [blame]	1260	bool dfa_failed = false;
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1261	StringPiece sp = "hello, world";
				1262	prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
Paul Wankadia	6c7b9af	2016-09-06 18:05:58 +1000	[diff] [blame]	1263	NULL, &dfa_failed, NULL);
				1264	if (dfa_failed) {
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1265	delete prog;
				1266	return NULL;
				1267	}
				1268
				1269	return prog;
				1270	}
				1271
Paul Wankadia	8f33ff2	2017-10-11 20:43:07 +1100	[diff] [blame]	1272	Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
				1273	return Compiler::CompileSet(re, anchor, max_mem);
Russ Cox	c334dcc	2010-07-15 18:48:48 -0700	[diff] [blame]	1274	}
				1275
Russ Cox	0a38cba	2010-03-02 17:17:51 -0800	[diff] [blame]	1276	} // namespace re2