Make the NFA execution engine use Prog::EmptyFlags().
Change-Id: Iaab77132da28aaf1b43e45b1c72d2597397462c3
Reviewed-on: https://code-review.googlesource.com/c/35770
Reviewed-by: Paul Wankadia <[email protected]>
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 1a3e0af..b2d5c0a 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -95,20 +95,20 @@
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
- // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+ // context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
- void AddToThreadq(Threadq* q, int id0, int c, int flag,
+ void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
+ // context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
- // flag is the bitwise OR of Bol, Eol, etc., specifying whether
- // ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
- inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+ int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+ const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
@@ -204,9 +204,9 @@
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
-// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
-void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
+void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0) {
if (id0 == 0)
return;
@@ -318,7 +318,7 @@
stk[nstk++] = AddState(id+1);
// Continue on if we have all the right flag bits.
- if (ip->empty() & ~flag)
+ if (ip->empty() & ~Prog::EmptyFlags(context, p))
break;
a = AddState(ip->out());
goto Loop;
@@ -328,13 +328,13 @@
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
+// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
-// flag is the bitwise OR of Bol, Eol, etc., specifying whether
-// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
-int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+ const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
@@ -360,7 +360,7 @@
break;
case kInstByteRange:
- AddToThreadq(nextq, ip->out(), c, flag, p, t);
+ AddToThreadq(nextq, ip->out(), c, context, p, t);
break;
case kInstAltMatch:
@@ -500,38 +500,9 @@
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
- int wasword = 0;
-
- if (text.begin() > context.begin())
- wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
- // Check for empty-width specials.
- int flag = 0;
-
- // ^ and \A
- if (p == context.begin())
- flag |= kEmptyBeginText | kEmptyBeginLine;
- else if (p <= context.end() && p[-1] == '\n')
- flag |= kEmptyBeginLine;
-
- // $ and \z
- if (p == context.end())
- flag |= kEmptyEndText | kEmptyEndLine;
- else if (p < context.end() && p[0] == '\n')
- flag |= kEmptyEndLine;
-
- // \b and \B
- int isword = 0;
- if (p < context.end())
- isword = Prog::IsWordChar(p[0] & 0xFF);
-
- if (isword != wasword)
- flag |= kEmptyWordBoundary;
- else
- flag |= kEmptyNonWordBoundary;
-
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
@@ -541,7 +512,7 @@
else if (p < text.end())
c = p[0] & 0xFF;
- fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
+ fprintf(stderr, "%c:", c);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
@@ -552,7 +523,7 @@
}
// This is a no-op the first time around the loop because runq is empty.
- int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
+ int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
@@ -604,17 +575,14 @@
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
- isword = 0;
- } else {
- isword = Prog::IsWordChar(p[0] & 0xFF);
}
- flag = Prog::EmptyFlags(context, p);
}
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
- AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
+ AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
+ t);
Decref(t);
}
@@ -624,8 +592,6 @@
fprintf(stderr, "dead\n");
break;
}
-
- wasword = isword;
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)