| use alloc::{vec, vec::Vec}; |
| |
| use crate::{ |
| int::{NonMaxUsize, U32}, |
| nfa::{State, StateID, NFA}, |
| pool::CachePoolGuard, |
| utf8, |
| }; |
| |
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
    /// The NFA whose execution this PikeVM simulates during a search.
    nfa: NFA,
}
| |
impl PikeVM {
    /// Create a new PikeVM searcher that uses the given NFA.
    pub(crate) fn new(nfa: NFA) -> PikeVM {
        PikeVM { nfa }
    }

    /// Return the underlying NFA used by this PikeVM.
    pub(crate) fn nfa(&self) -> &NFA {
        &self.nfa
    }

    /// Returns an iterator of non-overlapping matches in the given haystack.
    pub(crate) fn find_iter<'r, 'h>(
        &'r self,
        cache: CachePoolGuard<'r>,
        haystack: &'h [u8],
    ) -> FindMatches<'r, 'h> {
        FindMatches {
            pikevm: self,
            cache,
            haystack,
            at: 0,
            // Two slots are enough to track the overall match span: slot 0
            // holds the start offset and slot 1 holds the end offset.
            slots: vec![None, None],
            last_match_end: None,
        }
    }

    /// Returns an iterator of non-overlapping capture matches in the given
    /// haystack.
    pub(crate) fn captures_iter<'r, 'h>(
        &'r self,
        cache: CachePoolGuard<'r>,
        haystack: &'h [u8],
    ) -> CapturesMatches<'r, 'h> {
        // OK because the NFA wouldn't have compiled if this could overflow.
        let len = self.nfa().group_len().checked_mul(2).unwrap();
        CapturesMatches {
            it: FindMatches {
                pikevm: self,
                cache,
                haystack,
                at: 0,
                // Two slots (start and end) per capturing group.
                slots: vec![None; len],
                last_match_end: None,
            },
        }
    }

    /// The implementation of standard leftmost search.
    ///
    /// Capturing group spans are written to `slots`, but only if requested.
    /// `slots` can be any length. Any slot in the NFA that is activated but
    /// which is out of bounds for the given `slots` is ignored.
    ///
    /// Returns true if and only if a match was found.
    pub(crate) fn search(
        &self,
        cache: &mut Cache,
        haystack: &[u8],
        start: usize,
        end: usize,
        earliest: bool,
        slots: &mut [Option<NonMaxUsize>],
    ) -> bool {
        cache.setup_search(slots.len());
        if start > end {
            return false;
        }
        // Why do we even care about this? Well, in our `slots` representation,
        // we use usize::MAX as a sentinel to indicate "no match." This isn't
        // problematic so long as our haystack doesn't have a maximal length.
        // Byte slices are guaranteed by Rust to have a length that fits into
        // isize, and so this assert should always pass. But we put it here to
        // make our assumption explicit.
        assert!(
            haystack.len() < core::usize::MAX,
            "byte slice lengths must be less than usize MAX",
        );

        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
        let start_id = self.nfa().start();
        let anchored = self.nfa().is_start_anchored();
        let mut matched = false;
        // Yes, our search doesn't end at `end`, but includes it. This is
        // necessary because matches are delayed by one byte. The delay is used
        // to handle look-behind assertions. In the case of the PikeVM, the
        // delay is implemented by not considering a match to exist until it
        // is visited in `nexts`. Technically, we know a match exists in the
        // previous iteration via `epsilon_closure`.
        let mut at = start;
        while at <= end {
            // If we have no states left to visit, then there are some cases
            // where we know we can quit early or even skip ahead.
            if curr.set.is_empty() {
                // We have a match so we can quit.
                if matched {
                    break;
                }
                // If we're running an anchored search and we've advanced
                // beyond the start position with no other states to try, then
                // we will never observe a match and thus can stop.
                if anchored && at > start {
                    break;
                }
            }
            // Instead of using a hypothetical unanchored start state in the
            // NFA (which doesn't exist, but we could add it), we actually
            // always use its anchored starting state. As a result, when doing
            // an unanchored search, we need to simulate our own '(?s:.)*?'
            // prefix, to permit a match to appear anywhere.
            //
            // Now, we don't *have* to do things this way. We could create
            // a proper unanchored start state in the NFA and do one
            // `epsilon_closure` call from that starting state before the main
            // loop here. And that is just as correct. However, it turns out to
            // be slower than our approach here because it slightly increases
            // the cost of processing each byte by requiring us to visit
            // more NFA states to deal with the additional NFA states in the
            // unanchored prefix. By simulating it explicitly here, we lower
            // those costs substantially. The cost is itself small, but it adds
            // up for large haystacks.
            //
            // In order to simulate the '(?s:.)*?' prefix---which is not
            // greedy---we are careful not to perform an epsilon closure on
            // the start state if we already have a match. Namely, if we
            // did otherwise, we would never reach a terminating condition
            // because there would always be additional states to process.
            //
            // NOTE(review): the start-state closure is also re-seeded when
            // `anchored` is true; presumably it is harmless then because the
            // early-break above fires once `curr` empties — confirm.
            if !matched {
                // Since we are adding to the 'curr' active states and since
                // this is for the start ID, we use a slots slice that is
                // guaranteed to have the right length but where every element
                // is absent. This is exactly what we want, because this
                // epsilon closure is responsible for simulating an unanchored
                // '(?s:.)*?' prefix. It is specifically outside of any
                // capturing groups, and thus, using slots that are always
                // absent is correct.
                //
                // Note though that we can't just use `&mut []` here, since
                // this epsilon closure may traverse through `Capture` states
                // transitions, and thus must be able to write offsets to the
                // slots given which are later copied to slot values in `curr`.
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(
                    stack, slots, curr, haystack, at, start_id,
                );
            }
            let (ch, len) = utf8::decode_lossy(&haystack[at..]);
            if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
                matched = true;
            }
            // Unless the caller asked us to return early, we need to mush
            // on to see if we can extend our match. (But note that 'nexts'
            // will quit right after seeing a match, as is consistent with
            // leftmost-first match priority.)
            //
            // (`len == 0` presumably means there were no bytes left to
            // decode, i.e., `at` has reached the end of the haystack, so
            // there is nothing more to consume.)
            if (earliest && matched) || len == 0 {
                break;
            }
            core::mem::swap(curr, next);
            next.set.clear();
            at += len;
        }
        matched
    }

    /// Process the active states in 'curr' to find the states (written to
    /// 'next') we should process for the next byte in the haystack.
    ///
    /// 'stack' is used to perform a depth first traversal of the NFA when
    /// computing an epsilon closure.
    ///
    /// When a match is found, the slots for that match state (in 'curr') are
    /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
    /// stops (unless the PikeVM was configured with MatchKind::All semantics).
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`.
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if a match state was visited.
    fn nexts(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr: &mut ActiveStates,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        slots: &mut [Option<NonMaxUsize>],
    ) -> bool {
        let ActiveStates { ref set, ref mut slot_table } = *curr;
        for sid in set.iter() {
            if self.next(
                stack, slot_table, next, haystack, at, at_ch, at_len, sid,
            ) {
                // Copy the matching state's slots to the caller and stop.
                // Since 'set' preserves insertion order, any remaining states
                // have lower priority under leftmost-first semantics.
                slots.copy_from_slice(slot_table.for_state(sid));
                return true;
            }
        }
        false
    }

    /// Starting from `sid`, if the position `at` in the `haystack` has a
    /// transition defined out of `sid`, then add the state transitioned to and
    /// its epsilon closure to the `next` set of states to explore.
    ///
    /// `stack` is used by the epsilon closure computation to perform a depth
    /// first traversal of the NFA.
    ///
    /// `curr_slot_table` should be the table of slots for the current set of
    /// states being explored. If there is a transition out of `sid`, then
    /// sid's row in the slot table is used to perform the epsilon closure.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`. The caller provides it so that this routine doesn't
    /// need to re-decode it. (Since it's expected that this routine is called
    /// multiple times for each position.)
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if `sid` is a match state.
    fn next(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slot_table: &mut SlotTable,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        sid: StateID,
    ) -> bool {
        match *self.nfa.state(sid) {
            // Epsilon-only states were already resolved when `curr` was
            // built via epsilon closure, so there's nothing to do here.
            State::Fail
            | State::Goto { .. }
            | State::Splits { .. }
            | State::Capture { .. } => false,
            State::Char { target, ch } => {
                if at_ch == ch && at_len > 0 {
                    let slots = curr_slot_table.for_state(sid);
                    // OK because `at_len` is always derived from the number
                    // of bytes read from `at` that make up `at_ch`. So this
                    // will never wrap.
                    let at = at.wrapping_add(at_len);
                    self.epsilon_closure(
                        stack, slots, next, haystack, at, target,
                    );
                }
                false
            }
            State::Ranges { target, ref ranges } => {
                // Ranges are sorted, so once a range starts past `at_ch`,
                // no later range can contain it.
                for (start, end) in ranges.iter().copied() {
                    if start > at_ch {
                        break;
                    } else if start <= at_ch && at_ch <= end {
                        if at_len == 0 {
                            return false;
                        }
                        let slots = curr_slot_table.for_state(sid);
                        // OK because `at_len` is always derived from the
                        // number of bytes read from `at` that make up `at_ch`.
                        // So this will never wrap.
                        let at = at.wrapping_add(at_len);
                        self.epsilon_closure(
                            stack, slots, next, haystack, at, target,
                        );
                    }
                }
                false
            }
            State::Match => true,
        }
    }

    /// Compute the epsilon closure of `sid`, writing the closure into `next`
    /// while copying slot values from `curr_slots` into corresponding states
    /// in `next`. `curr_slots` should be the slot values corresponding to
    /// `sid`.
    ///
    /// The given `stack` is used to perform a depth first traversal of the
    /// NFA by recursively following all epsilon transitions out of `sid`.
    /// Conditional epsilon transitions are followed if and only if they are
    /// satisfied for the position `at` in the `input` haystack.
    ///
    /// While this routine may write to `curr_slots`, once it returns, any
    /// writes are undone and the original values (even if absent) are
    /// restored.
    fn epsilon_closure(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        sid: StateID,
    ) {
        stack.push(FollowEpsilon::Explore(sid));
        while let Some(frame) = stack.pop() {
            match frame {
                // Undo a slot write made by a previously explored `Capture`
                // state, restoring its original (possibly absent) offset.
                FollowEpsilon::RestoreCapture { slot, offset } => {
                    curr_slots[slot.as_usize()] = offset;
                }
                FollowEpsilon::Explore(sid) => {
                    self.epsilon_closure_explore(
                        stack, curr_slots, next, haystack, at, sid,
                    );
                }
            }
        }
    }

    /// Explore all of the epsilon transitions out of `sid`. This is mostly
    /// split out from `epsilon_closure` in order to clearly delineate
    /// the actual work of computing an epsilon closure from the stack
    /// book-keeping.
    ///
    /// This will push any additional explorations needed on to `stack`.
    ///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Captures` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
    /// the capturing group spans are forwarded from the currently active state
    /// to the next.
    ///
    /// `next` refers to the next set of active states. Computing an epsilon
    /// closure may increase the next set of active states.
    ///
    /// `haystack` refers to the what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
    fn epsilon_closure_explore(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        mut sid: StateID,
    ) {
        // We can avoid pushing some state IDs on to our stack in precisely
        // the cases where a 'push(x)' would be immediately followed by a 'x
        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
        // to be the next state ID we want to explore once we're done with
        // our initial exploration. In practice, this avoids a lot of stack
        // thrashing.
        loop {
            // Record this state as part of our next set of active states. If
            // we've already explored it, then no need to do it again.
            if !next.set.insert(sid) {
                return;
            }
            match *self.nfa.state(sid) {
                // Non-epsilon states terminate this branch of the closure.
                // Forward the current slot values to the state's row in
                // 'next' so capture spans survive into the next step.
                State::Fail
                | State::Match { .. }
                | State::Char { .. }
                | State::Ranges { .. } => {
                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                    return;
                }
                State::Goto { target, look: None } => {
                    sid = target;
                }
                State::Goto { target, look: Some(look) } => {
                    if !look.is_match(haystack, at) {
                        return;
                    }
                    sid = target;
                }
                State::Splits { ref targets, reverse: false } => {
                    // Continue with the first target and push the remainder
                    // in reverse, so that popping yields them in their
                    // original priority order.
                    sid = match targets.get(0) {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[1..]
                            .iter()
                            .copied()
                            .rev()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Splits { ref targets, reverse: true } => {
                    // A reversed split explores targets in reverse priority
                    // order: continue with the last target and push the rest
                    // so they pop from last to first.
                    sid = match targets.last() {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[..targets.len() - 1]
                            .iter()
                            .copied()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Capture { target, slot } => {
                    // There's no need to do anything with slots that
                    // ultimately won't be copied into the caller-provided
                    // 'Captures' value. So we just skip dealing with them at
                    // all.
                    if slot.as_usize() < curr_slots.len() {
                        // Push a frame that restores the old offset once the
                        // sub-exploration below this capture is finished.
                        stack.push(FollowEpsilon::RestoreCapture {
                            slot,
                            offset: curr_slots[slot.as_usize()],
                        });
                        // OK because length of a slice must fit into an isize.
                        curr_slots[slot.as_usize()] =
                            Some(NonMaxUsize::new(at).unwrap());
                    }
                    sid = target;
                }
            }
        }
    }
}
| |
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (which also bounds the
/// cache guard borrowed from its pool) and `'h` represents the lifetime of
/// the haystack.
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
    /// The PikeVM that executes each search.
    pikevm: &'r PikeVM,
    /// Guard over the mutable scratch state used by the PikeVM.
    cache: CachePoolGuard<'r>,
    /// The haystack being searched.
    haystack: &'h [u8],
    /// The offset at which the next search starts.
    at: usize,
    /// Slot storage written by each search; slots 0 and 1 hold the overall
    /// match start and end offsets.
    slots: Vec<Option<NonMaxUsize>>,
    /// The end offset of the previous match, used to prevent empty matches
    /// from overlapping a prior match.
    last_match_end: Option<usize>,
}
| |
| impl<'r, 'h> Iterator for FindMatches<'r, 'h> { |
| type Item = (usize, usize); |
| |
| fn next(&mut self) -> Option<(usize, usize)> { |
| if !self.pikevm.search( |
| &mut self.cache, |
| self.haystack, |
| self.at, |
| self.haystack.len(), |
| false, |
| &mut self.slots, |
| ) { |
| return None; |
| } |
| let mut m = |
| (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); |
| if m.0 >= m.1 { |
| m = self.handle_overlapping_empty_match(m)?; |
| } |
| self.at = m.1; |
| self.last_match_end = Some(m.1); |
| Some(m) |
| } |
| } |
| |
| impl<'r, 'h> FindMatches<'r, 'h> { |
| /// Handles the special case of an empty match by ensuring that 1) the |
| /// iterator always advances and 2) empty matches never overlap with other |
| /// matches. |
| /// |
| /// Note that we mark this cold and forcefully prevent inlining because |
| /// handling empty matches like this is extremely rare and does require a |
| /// bit of code, comparatively. Keeping this code out of the main iterator |
| /// function keeps it smaller and more amenable to inlining itself. |
| #[cold] |
| #[inline(never)] |
| fn handle_overlapping_empty_match( |
| &mut self, |
| mut m: (usize, usize), |
| ) -> Option<(usize, usize)> { |
| assert!(m.0 >= m.1); |
| if Some(m.1) == self.last_match_end { |
| let len = |
| core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1); |
| self.at = self.at.checked_add(len).unwrap(); |
| if !self.pikevm.search( |
| &mut self.cache, |
| self.haystack, |
| self.at, |
| self.haystack.len(), |
| false, |
| &mut self.slots, |
| ) { |
| return None; |
| } |
| m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); |
| } |
| Some(m) |
| } |
| } |
| |
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (which also
/// bounds the cache guard borrowed from its pool) and `'h` represents the
/// lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
    /// The underlying match iterator; its slots hold the capture offsets for
    /// the most recent match.
    it: FindMatches<'r, 'h>,
}
| |
| impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { |
| type Item = Vec<Option<NonMaxUsize>>; |
| |
| fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> { |
| self.it.next()?; |
| Some(self.it.slots.clone()) |
| } |
| } |
| |
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
pub(crate) struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack. Swapped with `curr` after each position.
    next: ActiveStates,
}
| |
| impl Cache { |
| /// Create a new `PikeVM` cache. |
| /// |
| /// A potentially more convenient routine to create a cache is |
| /// `PikeVM::create_cache`, as it does not require also importing the |
| /// `Cache` type. |
| /// |
| /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
| /// then you must call `Cache::reset` with the desired `PikeVM`. |
| pub(crate) fn new(re: &PikeVM) -> Cache { |
| Cache { |
| stack: vec![], |
| curr: ActiveStates::new(re), |
| next: ActiveStates::new(re), |
| } |
| } |
| |
| /// Clears this cache. This should be called at the start of every search |
| /// to ensure we start with a clean slate. |
| /// |
| /// This also sets the length of the capturing groups used in the current |
| /// search. This permits an optimization where by 'SlotTable::for_state' |
| /// only returns the number of slots equivalent to the number of slots |
| /// given in the 'Captures' value. This may be less than the total number |
| /// of possible slots, e.g., when one only wants to track overall match |
| /// offsets. This in turn permits less copying of capturing group spans |
| /// in the PikeVM. |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.stack.clear(); |
| self.curr.setup_search(captures_slot_len); |
| self.next.setup_search(captures_slot_len); |
| } |
| } |
| |
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive(Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}
| |
| impl ActiveStates { |
| /// Create a new set of active states for the given PikeVM. The active |
| /// states returned may only be used with the given PikeVM. (Use 'reset' |
| /// to re-purpose the allocation for a different PikeVM.) |
| fn new(re: &PikeVM) -> ActiveStates { |
| let mut active = ActiveStates { |
| set: SparseSet::new(0), |
| slot_table: SlotTable::new(), |
| }; |
| active.reset(re); |
| active |
| } |
| |
| /// Reset this set of active states such that it can be used with the given |
| /// PikeVM (and only that PikeVM). |
| fn reset(&mut self, re: &PikeVM) { |
| self.set.resize(re.nfa().len()); |
| self.slot_table.reset(re); |
| } |
| |
| /// Setup this set of active states for a new search. The given slot |
| /// length should be the number of slots in a caller provided 'Captures' |
| /// (and may be zero). |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.set.clear(); |
| self.slot_table.setup_search(captures_slot_len); |
| } |
| } |
| |
/// A table of slots, where each row represents a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the `Captures` NFA state.
#[derive(Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets.
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided `Captures` value for the
    /// current search. Setting this to `slots_per_state` is always correct,
    /// but may be wasteful. Rows handed out by `for_state` are truncated to
    /// this length.
    slots_for_captures: usize,
}
| |
| impl SlotTable { |
| /// Create a new slot table. |
| /// |
| /// One should call 'reset' with the corresponding PikeVM before use. |
| fn new() -> SlotTable { |
| SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
| } |
| |
| /// Reset this slot table such that it can be used with the given PikeVM |
| /// (and only that PikeVM). |
| fn reset(&mut self, re: &PikeVM) { |
| let nfa = re.nfa(); |
| // OK because NFA construction would have failed if this overflowed. |
| self.slots_per_state = nfa.group_len().checked_mul(2).unwrap(); |
| // This is always correct, but may be reduced for a particular search |
| // if fewer slots were given by the caller, e.g., none at all or only |
| // slots for tracking the overall match instead of all slots for every |
| // group. |
| self.slots_for_captures = self.slots_per_state; |
| let len = nfa |
| .len() |
| // We add 1 so that our last row is always empty. We use it as |
| // "scratch" space for computing the epsilon closure off of the |
| // starting state. |
| .checked_add(1) |
| .and_then(|x| x.checked_mul(self.slots_per_state)) |
| // It seems like this could actually panic on legitimate inputs |
| // on 32-bit targets. Should we somehow convert this to an error? |
| // What about something similar for the lazy DFA cache? If you're |
| // tripping this assert, please file a bug. |
| .expect("slot table length doesn't overflow"); |
| self.table.resize(len, None); |
| } |
| |
| /// Perform any per-search setup for this slot table. |
| /// |
| /// In particular, this sets the length of the number of slots used in the |
| /// slots given by the caller (if any at all). This number may be smaller |
| /// than the total number of slots available, e.g., when the caller is only |
| /// interested in tracking the overall match and not the spans of every |
| /// matching capturing group. Only tracking the overall match can save a |
| /// substantial amount of time copying capturing spans during a search. |
| fn setup_search(&mut self, captures_slot_len: usize) { |
| self.slots_for_captures = captures_slot_len; |
| } |
| |
| /// Return a mutable slice of the slots for the given state. |
| /// |
| /// Note that the length of the slice returned may be less than the total |
| /// number of slots available for this state. In particular, the length |
| /// always matches the number of slots indicated via `setup_search`. |
| fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
| let i = sid.as_usize() * self.slots_per_state; |
| &mut self.table[i..i + self.slots_for_captures] |
| } |
| |
| /// Return a slice of slots of appropriate length where every slot offset |
| /// is guaranteed to be absent. This is useful in cases where you need to |
| /// compute an epsilon closure outside of the user supplied regex, and thus |
| /// never want it to have any capturing slots set. |
| fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
| let i = self.table.len() - self.slots_per_state; |
| &mut self.table[i..i + self.slots_for_captures] |
| } |
| } |
| |
/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure.")
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (from the
/// 'step' routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
#[derive(Clone, Debug)]
enum FollowEpsilon {
    /// Explore the epsilon transitions from a state ID.
    Explore(StateID),
    /// Reset the given `slot` to the given `offset` (which might be `None`).
    RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
| |
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// where as a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}
| |
| impl SparseSet { |
| /// Create a new sparse set with the given capacity. |
| /// |
| /// Sparse sets have a fixed size and they cannot grow. Attempting to |
| /// insert more distinct elements than the total capacity of the set will |
| /// result in a panic. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| fn new(capacity: usize) -> SparseSet { |
| let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
| set.resize(capacity); |
| set |
| } |
| |
| /// Resizes this sparse set to have the new capacity given. |
| /// |
| /// This set is automatically cleared. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| fn resize(&mut self, new_capacity: usize) { |
| assert!( |
| new_capacity <= u32::MAX.as_usize(), |
| "sparse set capacity cannot excced {:?}", |
| u32::MAX, |
| ); |
| self.clear(); |
| self.dense.resize(new_capacity, 0); |
| self.sparse.resize(new_capacity, 0); |
| } |
| |
| /// Returns the capacity of this set. |
| /// |
| /// The capacity represents a fixed limit on the number of distinct |
| /// elements that are allowed in this set. The capacity cannot be changed. |
| fn capacity(&self) -> usize { |
| self.dense.len() |
| } |
| |
| /// Returns the number of elements in this set. |
| fn len(&self) -> usize { |
| self.len |
| } |
| |
| /// Returns true if and only if this set is empty. |
| fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| |
| /// Insert the state ID value into this set and return true if the given |
| /// state ID was not previously in this set. |
| /// |
| /// This operation is idempotent. If the given value is already in this |
| /// set, then this is a no-op. |
| /// |
| /// If more than `capacity` ids are inserted, then this panics. |
| fn insert(&mut self, id: StateID) -> bool { |
| if self.contains(id) { |
| return false; |
| } |
| |
| let index = self.len(); |
| assert!( |
| index < self.capacity(), |
| "{:?} exceeds capacity of {:?} when inserting {:?}", |
| index, |
| self.capacity(), |
| id, |
| ); |
| self.dense[index] = id; |
| // OK because we don't permit the capacity to be set higher than |
| // u32::MAX. |
| self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); |
| self.len += 1; |
| true |
| } |
| |
| /// Returns true if and only if this set contains the given value. |
| fn contains(&self, id: StateID) -> bool { |
| let index = self.sparse[id.as_usize()]; |
| index.as_usize() < self.len() && self.dense[index.as_usize()] == id |
| } |
| |
| /// Clear this set such that it has no members. |
| fn clear(&mut self) { |
| self.len = 0; |
| } |
| |
| /// Returns an iterator over all the state IDs in this set in the order in |
| /// which they were inserted. |
| fn iter(&self) -> SparseSetIter<'_> { |
| SparseSetIter(self.dense[..self.len()].iter()) |
| } |
| } |
| |
| impl core::fmt::Debug for SparseSet { |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| let elements: Vec<StateID> = self.iter().collect(); |
| f.debug_tuple("SparseSet").field(&elements).finish() |
| } |
| } |
| |
/// An iterator over all elements in a sparse set.
///
/// Yields state IDs by value, in insertion order (it walks the `dense` half
/// of the set).
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
| |
| impl<'a> Iterator for SparseSetIter<'a> { |
| type Item = StateID; |
| |
| fn next(&mut self) -> Option<StateID> { |
| self.0.next().map(|&id| id) |
| } |
| } |