| // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Tested by search_test.cc. |
| // |
| // Prog::SearchOnePass is an efficient implementation of |
| // regular expression search with submatch tracking for |
| // what I call "one-pass regular expressions". (An alternate |
| // name might be "backtracking-free regular expressions".) |
| // |
| // One-pass regular expressions have the property that |
| // at each input byte during an anchored match, there may be |
| // multiple alternatives but only one can proceed for any |
| // given input byte. |
| // |
| // For example, the regexp /x*yx*/ is one-pass: you read |
| // x's until a y, then you read the y, then you keep reading x's. |
| // At no point do you have to guess what to do or back up |
| // and try a different guess. |
| // |
| // On the other hand, /x*x/ is not one-pass: when you're |
| // looking at an input "x", it's not clear whether you should |
| // use it to extend the x* or as the final x. |
| // |
| // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. |
| // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. |
| // |
| // A simple intuition for identifying one-pass regular expressions |
| // is that it's always immediately obvious when a repetition ends. |
| // It must also be immediately obvious which branch of an | to take: |
| // |
| // /x(y|z)/ is one-pass, but /(xy|xz)/ is not. |
| // |
| // The NFA-based search in nfa.cc does some bookkeeping to |
| // avoid the need for backtracking and its associated exponential blowup. |
| // But if we have a one-pass regular expression, there is no |
| // possibility of backtracking, so there is no need for the |
| // extra bookkeeping. Hence, this code. |
| // |
| // On a one-pass regular expression, the NFA code in nfa.cc |
| // runs at about 1/20 of the backtracking-based PCRE speed. |
| // In contrast, the code in this file runs at about the same |
| // speed as PCRE. |
| // |
| // One-pass regular expressions get used a lot when RE is |
| // used for parsing simple strings, so it pays off to |
| // notice them and handle them efficiently. |
| // |
| // See also Anne Brüggemann-Klein and Derick Wood, |
| // "One-unambiguous regular languages", Information and Computation 142(2). |
| |
| #include <stdint.h> |
| #include <string.h> |
| #include <algorithm> |
| #include <map> |
| #include <string> |
| #include <vector> |
| |
| #include "util/util.h" |
| #include "util/logging.h" |
| #include "util/pod_array.h" |
| #include "util/sparse_set.h" |
| #include "util/strutil.h" |
| #include "util/utf.h" |
| #include "re2/prog.h" |
| #include "re2/stringpiece.h" |
| |
// OneState ends in a zero-sized array (OneState::action) on purpose.
// MSVC reports that as warning C4200, so switch the warning off there.
#if defined(_MSC_VER)
#pragma warning(disable: 4200)
#endif
| |
| namespace re2 { |
| |
// Compile-time switch for verbose diagnostics.  When true, IsOnePass logs
// why a program was rejected and dumps the automaton it built.
static constexpr bool ExtraDebug = false;
| |
| // The key insight behind this implementation is that the |
| // non-determinism in an NFA for a one-pass regular expression |
| // is contained. To explain what that means, first a |
| // refresher about what regular expression programs look like |
| // and how the usual NFA execution runs. |
| // |
| // In a regular expression program, only the kInstByteRange |
| // instruction processes an input byte c and moves on to the |
| // next byte in the string (it does so if c is in the given range). |
| // The kInstByteRange instructions correspond to literal characters |
| // and character classes in the regular expression. |
| // |
| // The kInstAlt instructions are used as wiring to connect the |
| // kInstByteRange instructions together in interesting ways when |
| // implementing | + and *. |
| // The kInstAlt instruction forks execution, like a goto that |
| // jumps to ip->out() and ip->out1() in parallel. Each of the |
| // resulting computation paths is called a thread. |
| // |
| // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- |
| // are interesting in their own right but like kInstAlt they don't |
| // advance the input pointer. Only kInstByteRange does. |
| // |
| // The automaton execution in nfa.cc runs all the possible |
| // threads of execution in lock-step over the input. To process |
| // a particular byte, each thread gets run until it either dies |
| // or finds a kInstByteRange instruction matching the byte. |
| // If the latter happens, the thread stops just past the |
| // kInstByteRange instruction (at ip->out()) and waits for |
| // the other threads to finish processing the input byte. |
| // Then, once all the threads have processed that input byte, |
| // the whole process repeats. The kInstAlt state instruction |
| // might create new threads during input processing, but no |
| // matter what, all the threads stop after a kInstByteRange |
| // and wait for the other threads to "catch up". |
| // Running in lock step like this ensures that the NFA reads |
| // the input string only once. |
| // |
| // Each thread maintains its own set of capture registers |
| // (the string positions at which it executed the kInstCapture |
| // instructions corresponding to capturing parentheses in the |
| // regular expression). Repeated copying of the capture registers |
| // is the main performance bottleneck in the NFA implementation. |
| // |
| // A regular expression program is "one-pass" if, no matter what |
| // the input string, there is only one thread that makes it |
| // past a kInstByteRange instruction at each input byte. This means |
| // that there is in some sense only one active thread throughout |
| // the execution. Other threads might be created during the |
| // processing of an input byte, but they are ephemeral: only one |
| // thread is left to start processing the next input byte. |
| // This is what I meant above when I said the non-determinism |
| // was "contained". |
| // |
| // To execute a one-pass regular expression program, we can build |
| // a DFA (no non-determinism) that has at most as many states as |
| // the NFA (compare this to the possibly exponential number of states |
| // in the general case). Each state records, for each possible |
| // input byte, the next state along with the conditions required |
| // before entering that state -- empty-width flags that must be true |
// and capture operations that must be performed.  It also records
// the set of conditions required to finish a match at that
// point in the input rather than process the next byte.
| |
// One node of the one-pass automaton: a small header followed directly in
// memory by a table of actions.  The table is indexed by bytemap_[c] for
// the next input byte c.  (The bytemap groups input bytes into equivalence
// classes, which keeps the table, and so the memory footprint, small.)
struct OneState {
  // Conditions that must hold for the automaton to match right now.
  uint32_t matchcond;
  // One action word per byte class; storage is allocated past the struct.
  uint32_t action[];
};
| |
| // The uint32_t conditions in the action are a combination of |
| // condition and capture bits and the next state. The bottom 16 bits |
| // are the condition and capture bits, and the top 16 are the index of |
| // the next state. |
| // |
| // Bits 0-5 are the empty-width flags from prog.h. |
| // Bit 6 is kMatchWins, which means the match takes |
| // priority over moving to next in a first-match search. |
| // The remaining bits mark capture registers that should |
| // be set to the current input position. The capture bits |
| // start at index 2, since the search loop can take care of |
| // cap[0], cap[1] (the overall match position). |
// That leaves room for 5 capture groups in total: the overall match $0
// plus the capturing parens $1 through $4.
| // No input position can satisfy both kEmptyWordBoundary |
| // and kEmptyNonWordBoundary, so we can use that as a sentinel |
| // instead of needing an extra bit. |
| |
| static const int kIndexShift = 16; // number of bits below index |
| static const int kEmptyShift = 6; // number of empty flags in prog.h |
| static const int kRealCapShift = kEmptyShift + 1; |
| static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; |
| |
| // Parameters used to skip over cap[0], cap[1]. |
| static const int kCapShift = kRealCapShift - 2; |
| static const int kMaxCap = kRealMaxCap + 2; |
| |
| static const uint32_t kMatchWins = 1 << kEmptyShift; |
| static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; |
| |
| static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; |
| |
| // Check, at compile time, that prog.h agrees with math above. |
| // This function is never called. |
| void OnePass_Checks() { |
| static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags, |
| "kEmptyShift disagrees with kEmptyAllFlags"); |
| // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. |
| static_assert(kMaxCap == Prog::kMaxOnePassCapture*2, |
| "kMaxCap disagrees with kMaxOnePassCapture"); |
| } |
| |
| static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) { |
| uint32_t satisfied = Prog::EmptyFlags(context, p); |
| if (cond & kEmptyAllFlags & ~satisfied) |
| return false; |
| return true; |
| } |
| |
| // Apply the capture bits in cond, saving p to the appropriate |
| // locations in cap[]. |
| static void ApplyCaptures(uint32_t cond, const char* p, |
| const char** cap, int ncap) { |
| for (int i = 2; i < ncap; i++) |
| if (cond & (1 << kCapShift << i)) |
| cap[i] = p; |
| } |
| |
| // Computes the OneState* for the given nodeindex. |
| static inline OneState* IndexToNode(uint8_t* nodes, int statesize, |
| int nodeindex) { |
| return reinterpret_cast<OneState*>(nodes + statesize*nodeindex); |
| } |
| |
| bool Prog::SearchOnePass(const StringPiece& text, |
| const StringPiece& const_context, |
| Anchor anchor, MatchKind kind, |
| StringPiece* match, int nmatch) { |
| if (anchor != kAnchored && kind != kFullMatch) { |
| LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; |
| return false; |
| } |
| |
| // Make sure we have at least cap[1], |
| // because we use it to tell if we matched. |
| int ncap = 2*nmatch; |
| if (ncap < 2) |
| ncap = 2; |
| |
| const char* cap[kMaxCap]; |
| for (int i = 0; i < ncap; i++) |
| cap[i] = NULL; |
| |
| const char* matchcap[kMaxCap]; |
| for (int i = 0; i < ncap; i++) |
| matchcap[i] = NULL; |
| |
| StringPiece context = const_context; |
| if (context.begin() == NULL) |
| context = text; |
| if (anchor_start() && context.begin() != text.begin()) |
| return false; |
| if (anchor_end() && context.end() != text.end()) |
| return false; |
| if (anchor_end()) |
| kind = kFullMatch; |
| |
| uint8_t* nodes = onepass_nodes_; |
| int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); |
| // start() is always mapped to the zeroth OneState. |
| OneState* state = IndexToNode(nodes, statesize, 0); |
| uint8_t* bytemap = bytemap_; |
| const char* bp = text.begin(); |
| const char* ep = text.end(); |
| const char* p; |
| bool matched = false; |
| matchcap[0] = bp; |
| cap[0] = bp; |
| uint32_t nextmatchcond = state->matchcond; |
| for (p = bp; p < ep; p++) { |
| int c = bytemap[*p & 0xFF]; |
| uint32_t matchcond = nextmatchcond; |
| uint32_t cond = state->action[c]; |
| |
| // Determine whether we can reach act->next. |
| // If so, advance state and nextmatchcond. |
| if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { |
| uint32_t nextindex = cond >> kIndexShift; |
| state = IndexToNode(nodes, statesize, nextindex); |
| nextmatchcond = state->matchcond; |
| } else { |
| state = NULL; |
| nextmatchcond = kImpossible; |
| } |
| |
| // This code section is carefully tuned. |
| // The goto sequence is about 10% faster than the |
| // obvious rewrite as a large if statement in the |
| // ASCIIMatchRE2 and DotMatchRE2 benchmarks. |
| |
| // Saving the match capture registers is expensive. |
| // Is this intermediate match worth thinking about? |
| |
| // Not if we want a full match. |
| if (kind == kFullMatch) |
| goto skipmatch; |
| |
| // Not if it's impossible. |
| if (matchcond == kImpossible) |
| goto skipmatch; |
| |
| // Not if the possible match is beaten by the certain |
| // match at the next byte. When this test is useless |
| // (e.g., HTTPPartialMatchRE2) it slows the loop by |
| // about 10%, but when it avoids work (e.g., DotMatchRE2), |
| // it cuts the loop execution by about 45%. |
| if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) |
| goto skipmatch; |
| |
| // Finally, the match conditions must be satisfied. |
| if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { |
| for (int i = 2; i < 2*nmatch; i++) |
| matchcap[i] = cap[i]; |
| if (nmatch > 1 && (matchcond & kCapMask)) |
| ApplyCaptures(matchcond, p, matchcap, ncap); |
| matchcap[1] = p; |
| matched = true; |
| |
| // If we're in longest match mode, we have to keep |
| // going and see if we find a longer match. |
| // In first match mode, we can stop if the match |
| // takes priority over the next state for this input byte. |
| // That bit is per-input byte and thus in cond, not matchcond. |
| if (kind == kFirstMatch && (cond & kMatchWins)) |
| goto done; |
| } |
| |
| skipmatch: |
| if (state == NULL) |
| goto done; |
| if ((cond & kCapMask) && nmatch > 1) |
| ApplyCaptures(cond, p, cap, ncap); |
| } |
| |
| // Look for match at end of input. |
| { |
| uint32_t matchcond = state->matchcond; |
| if (matchcond != kImpossible && |
| ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { |
| if (nmatch > 1 && (matchcond & kCapMask)) |
| ApplyCaptures(matchcond, p, cap, ncap); |
| for (int i = 2; i < ncap; i++) |
| matchcap[i] = cap[i]; |
| matchcap[1] = p; |
| matched = true; |
| } |
| } |
| |
| done: |
| if (!matched) |
| return false; |
| for (int i = 0; i < nmatch; i++) |
| match[i] = |
| StringPiece(matchcap[2 * i], |
| static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i])); |
| return true; |
| } |
| |
| |
| // Analysis to determine whether a given regexp program is one-pass. |
| |
| // If ip is not on workq, adds ip to work queue and returns true. |
| // If ip is already on work queue, does nothing and returns false. |
| // If ip is NULL, does nothing and returns true (pretends to add it). |
| typedef SparseSet Instq; |
| static bool AddQ(Instq *q, int id) { |
| if (id == 0) |
| return true; |
| if (q->contains(id)) |
| return false; |
| q->insert(id); |
| return true; |
| } |
| |
// An entry on the manual stack used by IsOnePass: an instruction still to
// be explored, paired with the condition bits in force when it was pushed.
struct InstCond {
  int id;         // instruction id
  uint32_t cond;  // condition bits
};
| |
| // Returns whether this is a one-pass program; that is, |
| // returns whether it is safe to use SearchOnePass on this program. |
| // These conditions must be true for any instruction ip: |
| // |
| // (1) for any other Inst nip, there is at most one input-free |
| // path from ip to nip. |
| // (2) there is at most one kInstByte instruction reachable from |
| // ip that matches any particular byte c. |
| // (3) there is at most one input-free path from ip to a kInstMatch |
| // instruction. |
| // |
| // This is actually just a conservative approximation: it might |
| // return false when the answer is true, when kInstEmptyWidth |
| // instructions are involved. |
| // Constructs and saves corresponding one-pass NFA on success. |
| bool Prog::IsOnePass() { |
| if (did_onepass_) |
| return onepass_nodes_ != NULL; |
| did_onepass_ = true; |
| |
| if (start() == 0) // no match |
| return false; |
| |
| // Steal memory for the one-pass NFA from the overall DFA budget. |
| // Willing to use at most 1/4 of the DFA budget (heuristic). |
| // Limit max node count to 65000 as a conservative estimate to |
| // avoid overflowing 16-bit node index in encoding. |
| int maxnodes = 2 + inst_count(kInstByteRange); |
| int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); |
| if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) |
| return false; |
| |
| // Flood the graph starting at the start state, and check |
| // that in each reachable state, each possible byte leads |
| // to a unique next state. |
| int stacksize = inst_count(kInstCapture) + |
| inst_count(kInstEmptyWidth) + |
| inst_count(kInstNop) + 1; // + 1 for start inst |
| PODArray<InstCond> stack(stacksize); |
| |
| int size = this->size(); |
| PODArray<int> nodebyid(size); // indexed by ip |
| memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); |
| |
| // Originally, nodes was a uint8_t[maxnodes*statesize], but that was |
| // unnecessarily optimistic: why allocate a large amount of memory |
| // upfront for a large program when it is unlikely to be one-pass? |
| std::vector<uint8_t> nodes; |
| |
| Instq tovisit(size), workq(size); |
| AddQ(&tovisit, start()); |
| nodebyid[start()] = 0; |
| int nalloc = 1; |
| nodes.insert(nodes.end(), statesize, 0); |
| for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { |
| int id = *it; |
| int nodeindex = nodebyid[id]; |
| OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); |
| |
| // Flood graph using manual stack, filling in actions as found. |
| // Default is none. |
| for (int b = 0; b < bytemap_range_; b++) |
| node->action[b] = kImpossible; |
| node->matchcond = kImpossible; |
| |
| workq.clear(); |
| bool matched = false; |
| int nstack = 0; |
| stack[nstack].id = id; |
| stack[nstack++].cond = 0; |
| while (nstack > 0) { |
| int id = stack[--nstack].id; |
| uint32_t cond = stack[nstack].cond; |
| |
| Loop: |
| Prog::Inst* ip = inst(id); |
| switch (ip->opcode()) { |
| default: |
| LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
| break; |
| |
| case kInstAltMatch: |
| // TODO(rsc): Ignoring kInstAltMatch optimization. |
| // Should implement it in this engine, but it's subtle. |
| DCHECK(!ip->last()); |
| // If already on work queue, (1) is violated: bail out. |
| if (!AddQ(&workq, id+1)) |
| goto fail; |
| id = id+1; |
| goto Loop; |
| |
| case kInstByteRange: { |
| int nextindex = nodebyid[ip->out()]; |
| if (nextindex == -1) { |
| if (nalloc >= maxnodes) { |
| if (ExtraDebug) |
| LOG(ERROR) << StringPrintf( |
| "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); |
| goto fail; |
| } |
| nextindex = nalloc; |
| AddQ(&tovisit, ip->out()); |
| nodebyid[ip->out()] = nalloc; |
| nalloc++; |
| nodes.insert(nodes.end(), statesize, 0); |
| // Update node because it might have been invalidated. |
| node = IndexToNode(nodes.data(), statesize, nodeindex); |
| } |
| for (int c = ip->lo(); c <= ip->hi(); c++) { |
| int b = bytemap_[c]; |
| // Skip any bytes immediately after c that are also in b. |
| while (c < 256-1 && bytemap_[c+1] == b) |
| c++; |
| uint32_t act = node->action[b]; |
| uint32_t newact = (nextindex << kIndexShift) | cond; |
| if (matched) |
| newact |= kMatchWins; |
| if ((act & kImpossible) == kImpossible) { |
| node->action[b] = newact; |
| } else if (act != newact) { |
| if (ExtraDebug) |
| LOG(ERROR) << StringPrintf( |
| "Not OnePass: conflict on byte %#x at state %d", c, *it); |
| goto fail; |
| } |
| } |
| if (ip->foldcase()) { |
| Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a'; |
| Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a'; |
| for (int c = lo; c <= hi; c++) { |
| int b = bytemap_[c]; |
| // Skip any bytes immediately after c that are also in b. |
| while (c < 256-1 && bytemap_[c+1] == b) |
| c++; |
| uint32_t act = node->action[b]; |
| uint32_t newact = (nextindex << kIndexShift) | cond; |
| if (matched) |
| newact |= kMatchWins; |
| if ((act & kImpossible) == kImpossible) { |
| node->action[b] = newact; |
| } else if (act != newact) { |
| if (ExtraDebug) |
| LOG(ERROR) << StringPrintf( |
| "Not OnePass: conflict on byte %#x at state %d", c, *it); |
| goto fail; |
| } |
| } |
| } |
| |
| if (ip->last()) |
| break; |
| // If already on work queue, (1) is violated: bail out. |
| if (!AddQ(&workq, id+1)) |
| goto fail; |
| id = id+1; |
| goto Loop; |
| } |
| |
| case kInstCapture: |
| case kInstEmptyWidth: |
| case kInstNop: |
| if (!ip->last()) { |
| // If already on work queue, (1) is violated: bail out. |
| if (!AddQ(&workq, id+1)) |
| goto fail; |
| stack[nstack].id = id+1; |
| stack[nstack++].cond = cond; |
| } |
| |
| if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap) |
| cond |= (1 << kCapShift) << ip->cap(); |
| if (ip->opcode() == kInstEmptyWidth) |
| cond |= ip->empty(); |
| |
| // kInstCapture and kInstNop always proceed to ip->out(). |
| // kInstEmptyWidth only sometimes proceeds to ip->out(), |
| // but as a conservative approximation we assume it always does. |
| // We could be a little more precise by looking at what c |
| // is, but that seems like overkill. |
| |
| // If already on work queue, (1) is violated: bail out. |
| if (!AddQ(&workq, ip->out())) { |
| if (ExtraDebug) |
| LOG(ERROR) << StringPrintf( |
| "Not OnePass: multiple paths %d -> %d\n", *it, ip->out()); |
| goto fail; |
| } |
| id = ip->out(); |
| goto Loop; |
| |
| case kInstMatch: |
| if (matched) { |
| // (3) is violated |
| if (ExtraDebug) |
| LOG(ERROR) << StringPrintf( |
| "Not OnePass: multiple matches from %d\n", *it); |
| goto fail; |
| } |
| matched = true; |
| node->matchcond = cond; |
| |
| if (ip->last()) |
| break; |
| // If already on work queue, (1) is violated: bail out. |
| if (!AddQ(&workq, id+1)) |
| goto fail; |
| id = id+1; |
| goto Loop; |
| |
| case kInstFail: |
| break; |
| } |
| } |
| } |
| |
| if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR). |
| LOG(ERROR) << "bytemap:\n" << DumpByteMap(); |
| LOG(ERROR) << "prog:\n" << Dump(); |
| |
| std::map<int, int> idmap; |
| for (int i = 0; i < size; i++) |
| if (nodebyid[i] != -1) |
| idmap[nodebyid[i]] = i; |
| |
| string dump; |
| for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { |
| int id = *it; |
| int nodeindex = nodebyid[id]; |
| if (nodeindex == -1) |
| continue; |
| OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); |
| StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", |
| nodeindex, id, node->matchcond); |
| for (int i = 0; i < bytemap_range_; i++) { |
| if ((node->action[i] & kImpossible) == kImpossible) |
| continue; |
| StringAppendF(&dump, " %d cond %#x -> %d id=%d\n", |
| i, node->action[i] & 0xFFFF, |
| node->action[i] >> kIndexShift, |
| idmap[node->action[i] >> kIndexShift]); |
| } |
| } |
| LOG(ERROR) << "nodes:\n" << dump; |
| } |
| |
| dfa_mem_ -= nalloc*statesize; |
| onepass_nodes_ = new uint8_t[nalloc*statesize]; |
| memmove(onepass_nodes_, nodes.data(), nalloc*statesize); |
| return true; |
| |
| fail: |
| return false; |
| } |
| |
| } // namespace re2 |