| /*! |
| Provides direct access to a DFA implementation of Aho-Corasick. |
| |
| This is a low-level API that generally only needs to be used in niche |
| circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
| instead of a DFA directly. Using an `DFA` directly is typically only necessary |
| when one needs access to the [`Automaton`] trait implementation. |
| */ |
| |
| use alloc::{vec, vec::Vec}; |
| |
| use crate::{ |
| automaton::Automaton, |
| nfa::noncontiguous, |
| util::{ |
| alphabet::ByteClasses, |
| error::{BuildError, MatchError}, |
| int::{Usize, U32}, |
| prefilter::Prefilter, |
| primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
| search::{Anchored, MatchKind, StartKind}, |
| special::Special, |
| }, |
| }; |
| |
| /// A DFA implementation of Aho-Corasick. |
| /// |
| /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of |
| /// this type directly. Using a `DFA` directly is typically only necessary when |
| /// one needs access to the [`Automaton`] trait implementation. |
| /// |
| /// This DFA can only be built by first constructing a [`noncontiguous::NFA`]. |
| /// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but |
| /// [`Builder::build_from_noncontiguous`] permits doing it explicitly. |
| /// |
| /// A DFA provides the best possible search performance (in this crate) via two |
| /// mechanisms: |
| /// |
| /// * All states use a dense representation for their transitions. |
| /// * All failure transitions are pre-computed such that they are never |
| /// explicitly handled at search time. |
| /// |
| /// These two facts combined mean that every state transition is performed |
| /// using a constant number of instructions. However, this comes at |
| /// great cost. The memory usage of a DFA can be quite exorbitant. |
| /// It is potentially multiple orders of magnitude greater than a |
| /// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange, |
| /// a DFA will typically have better search speed than a `contiguous::NFA`, but |
| /// not by orders of magnitude. |
| /// |
| /// Unless you have a small number of patterns or memory usage is not a concern |
| /// and search performance is critical, a DFA is usually not the best choice. |
| /// |
| /// Moreover, unlike the NFAs in this crate, it is costly for a DFA to |
| /// support for anchored and unanchored search configurations. Namely, |
| /// since failure transitions are pre-computed, supporting both anchored |
| /// and unanchored searches requires a duplication of the transition table, |
| /// making the memory usage of such a DFA ever bigger. (The NFAs in this crate |
| /// unconditionally support both anchored and unanchored searches because there |
| /// is essentially no added cost for doing so.) It is for this reason that |
| /// a DFA's support for anchored and unanchored searches can be configured |
| /// via [`Builder::start_kind`]. By default, a DFA only supports unanchored |
| /// searches. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to build an `DFA` directly and use it to execute |
| /// [`Automaton::try_find`]: |
| /// |
| /// ``` |
| /// use aho_corasick::{ |
| /// automaton::Automaton, |
| /// dfa::DFA, |
| /// Input, Match, |
| /// }; |
| /// |
| /// let patterns = &["b", "abc", "abcd"]; |
| /// let haystack = "abcd"; |
| /// |
| /// let nfa = DFA::new(patterns).unwrap(); |
| /// assert_eq!( |
| /// Some(Match::must(0, 1..2)), |
| /// nfa.try_find(&Input::new(haystack))?, |
| /// ); |
| /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| /// ``` |
| /// |
| /// It is also possible to implement your own version of `try_find`. See the |
| /// [`Automaton`] documentation for an example. |
| #[derive(Clone)] |
| pub struct DFA { |
| /// The DFA transition table. IDs in this table are pre-multiplied. So |
| /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride, |
| /// 2*stride, 3*stride, ... |
| trans: Vec<StateID>, |
| /// The matches for every match state in this DFA. This is first indexed by |
| /// state index (so that's `sid >> stride2`) and then by order in which the |
| /// matches are meant to occur. |
| matches: Vec<Vec<PatternID>>, |
| /// The amount of heap memory used, in bytes, by the inner Vecs of |
| /// 'matches'. |
| matches_memory_usage: usize, |
| /// The length of each pattern. This is used to compute the start offset |
| /// of a match. |
| pattern_lens: Vec<SmallIndex>, |
| /// A prefilter for accelerating searches, if one exists. |
| prefilter: Option<Prefilter>, |
| /// The match semantics built into this DFA. |
| match_kind: MatchKind, |
| /// The total number of states in this DFA. |
| state_len: usize, |
| /// The alphabet size, or total number of equivalence classes, for this |
| /// DFA. Note that the actual number of transitions in each state is |
| /// stride=2^stride2, where stride is the smallest power of 2 greater than |
| /// or equal to alphabet_len. We do things this way so that we can use |
| /// bitshifting to go from a state ID to an index into 'matches'. |
| alphabet_len: usize, |
| /// The exponent with a base 2, such that stride=2^stride2. Given a state |
| /// index 'i', its state identifier is 'i << stride2'. Given a state |
| /// identifier 'sid', its state index is 'sid >> stride2'. |
| stride2: usize, |
| /// The equivalence classes for this DFA. All transitions are defined on |
| /// equivalence classes and not on the 256 distinct byte values. |
| byte_classes: ByteClasses, |
| /// The length of the shortest pattern in this automaton. |
| min_pattern_len: usize, |
| /// The length of the longest pattern in this automaton. |
| max_pattern_len: usize, |
| /// The information required to deduce which states are "special" in this |
| /// DFA. |
| special: Special, |
| } |
| |
| impl DFA { |
| /// Create a new Aho-Corasick DFA using the default configuration. |
| /// |
| /// Use a [`Builder`] if you want to change the configuration. |
| pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> |
| where |
| I: IntoIterator<Item = P>, |
| P: AsRef<[u8]>, |
| { |
| DFA::builder().build(patterns) |
| } |
| |
| /// A convenience method for returning a new Aho-Corasick DFA builder. |
| /// |
| /// This usually permits one to just import the `DFA` type. |
| pub fn builder() -> Builder { |
| Builder::new() |
| } |
| } |
| |
| impl DFA { |
| /// A sentinel state ID indicating that a search should stop once it has |
| /// entered this state. When a search stops, it returns a match if one has |
| /// been found, otherwise no match. A DFA always has an actual dead state |
| /// at this ID. |
| /// |
| /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. |
| /// Namely, the whole point of a DFA is that the FAIL state is completely |
| /// compiled away. That is, DFA construction involves pre-computing the |
| /// failure transitions everywhere, such that failure transitions are no |
| /// longer used at search time. This, combined with its uniformly dense |
| /// representation, are the two most important factors in why it's faster |
| /// than the NFAs in this crate. |
| const DEAD: StateID = StateID::new_unchecked(0); |
| |
| /// Adds the given pattern IDs as matches to the given state and also |
| /// records the added memory usage. |
| fn set_matches( |
| &mut self, |
| sid: StateID, |
| pids: impl Iterator<Item = PatternID>, |
| ) { |
| let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); |
| let mut at_least_one = false; |
| for pid in pids { |
| self.matches[index].push(pid); |
| self.matches_memory_usage += PatternID::SIZE; |
| at_least_one = true; |
| } |
| assert!(at_least_one, "match state must have non-empty pids"); |
| } |
| } |
| |
| // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
| // returns a valid state ID given a valid state ID. We otherwise claim that |
| // all other methods are correct as well. |
| unsafe impl Automaton for DFA { |
| #[inline(always)] |
| fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
| // Either of the start state IDs can be DEAD, in which case, support |
| // for that type of search is not provided by this DFA. Which start |
| // state IDs are inactive depends on the 'StartKind' configuration at |
| // DFA construction time. |
| match anchored { |
| Anchored::No => { |
| let start = self.special.start_unanchored_id; |
| if start == DFA::DEAD { |
| Err(MatchError::invalid_input_unanchored()) |
| } else { |
| Ok(start) |
| } |
| } |
| Anchored::Yes => { |
| let start = self.special.start_anchored_id; |
| if start == DFA::DEAD { |
| Err(MatchError::invalid_input_anchored()) |
| } else { |
| Ok(start) |
| } |
| } |
| } |
| } |
| |
| #[inline(always)] |
| fn next_state( |
| &self, |
| _anchored: Anchored, |
| sid: StateID, |
| byte: u8, |
| ) -> StateID { |
| let class = self.byte_classes.get(byte); |
| self.trans[(sid.as_u32() + u32::from(class)).as_usize()] |
| } |
| |
| #[inline(always)] |
| fn is_special(&self, sid: StateID) -> bool { |
| sid <= self.special.max_special_id |
| } |
| |
| #[inline(always)] |
| fn is_dead(&self, sid: StateID) -> bool { |
| sid == DFA::DEAD |
| } |
| |
| #[inline(always)] |
| fn is_match(&self, sid: StateID) -> bool { |
| !self.is_dead(sid) && sid <= self.special.max_match_id |
| } |
| |
| #[inline(always)] |
| fn is_start(&self, sid: StateID) -> bool { |
| sid == self.special.start_unanchored_id |
| || sid == self.special.start_anchored_id |
| } |
| |
| #[inline(always)] |
| fn match_kind(&self) -> MatchKind { |
| self.match_kind |
| } |
| |
| #[inline(always)] |
| fn patterns_len(&self) -> usize { |
| self.pattern_lens.len() |
| } |
| |
| #[inline(always)] |
| fn pattern_len(&self, pid: PatternID) -> usize { |
| self.pattern_lens[pid].as_usize() |
| } |
| |
| #[inline(always)] |
| fn min_pattern_len(&self) -> usize { |
| self.min_pattern_len |
| } |
| |
| #[inline(always)] |
| fn max_pattern_len(&self) -> usize { |
| self.max_pattern_len |
| } |
| |
| #[inline(always)] |
| fn match_len(&self, sid: StateID) -> usize { |
| debug_assert!(self.is_match(sid)); |
| let offset = (sid.as_usize() >> self.stride2) - 2; |
| self.matches[offset].len() |
| } |
| |
| #[inline(always)] |
| fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
| debug_assert!(self.is_match(sid)); |
| let offset = (sid.as_usize() >> self.stride2) - 2; |
| self.matches[offset][index] |
| } |
| |
| #[inline(always)] |
| fn memory_usage(&self) -> usize { |
| use core::mem::size_of; |
| |
| (self.trans.len() * size_of::<u32>()) |
| + (self.matches.len() * size_of::<Vec<PatternID>>()) |
| + self.matches_memory_usage |
| + (self.pattern_lens.len() * size_of::<SmallIndex>()) |
| + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
| } |
| |
| #[inline(always)] |
| fn prefilter(&self) -> Option<&Prefilter> { |
| self.prefilter.as_ref() |
| } |
| } |
| |
| impl core::fmt::Debug for DFA { |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| use crate::{ |
| automaton::{fmt_state_indicator, sparse_transitions}, |
| util::debug::DebugByte, |
| }; |
| |
| writeln!(f, "dfa::DFA(")?; |
| for index in 0..self.state_len { |
| let sid = StateID::new_unchecked(index << self.stride2); |
| // While we do currently include the FAIL state in the transition |
| // table (to simplify construction), it is never actually used. It |
| // poses problems with the code below because it gets treated as |
| // a match state incidentally when it is, of course, not. So we |
| // special case it. The fail state is always the first state after |
| // the dead state. |
| // |
| // If the construction is changed to remove the fail state (it |
| // probably should be), then this special case should be updated. |
| if index == 1 { |
| writeln!(f, "F {:06}:", sid.as_usize())?; |
| continue; |
| } |
| fmt_state_indicator(f, self, sid)?; |
| write!(f, "{:06}: ", sid.as_usize())?; |
| |
| let it = (0..self.byte_classes.alphabet_len()).map(|class| { |
| (class.as_u8(), self.trans[sid.as_usize() + class]) |
| }); |
| for (i, (start, end, next)) in sparse_transitions(it).enumerate() { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| if start == end { |
| write!( |
| f, |
| "{:?} => {:?}", |
| DebugByte(start), |
| next.as_usize() |
| )?; |
| } else { |
| write!( |
| f, |
| "{:?}-{:?} => {:?}", |
| DebugByte(start), |
| DebugByte(end), |
| next.as_usize() |
| )?; |
| } |
| } |
| write!(f, "\n")?; |
| if self.is_match(sid) { |
| write!(f, " matches: ")?; |
| for i in 0..self.match_len(sid) { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| let pid = self.match_pattern(sid, i); |
| write!(f, "{}", pid.as_usize())?; |
| } |
| write!(f, "\n")?; |
| } |
| } |
| writeln!(f, "match kind: {:?}", self.match_kind)?; |
| writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; |
| writeln!(f, "state length: {:?}", self.state_len)?; |
| writeln!(f, "pattern length: {:?}", self.patterns_len())?; |
| writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; |
| writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; |
| writeln!(f, "alphabet length: {:?}", self.alphabet_len)?; |
| writeln!(f, "stride: {:?}", 1 << self.stride2)?; |
| writeln!(f, "byte classes: {:?}", self.byte_classes)?; |
| writeln!(f, "memory usage: {:?}", self.memory_usage())?; |
| writeln!(f, ")")?; |
| Ok(()) |
| } |
| } |
| |
| /// A builder for configuring an Aho-Corasick DFA. |
| /// |
| /// This builder has a subset of the options available to a |
| /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, |
| /// their behavior is identical. |
| #[derive(Clone, Debug)] |
| pub struct Builder { |
| noncontiguous: noncontiguous::Builder, |
| start_kind: StartKind, |
| byte_classes: bool, |
| } |
| |
| impl Default for Builder { |
| fn default() -> Builder { |
| Builder { |
| noncontiguous: noncontiguous::Builder::new(), |
| start_kind: StartKind::Unanchored, |
| byte_classes: true, |
| } |
| } |
| } |
| |
| impl Builder { |
| /// Create a new builder for configuring an Aho-Corasick DFA. |
| pub fn new() -> Builder { |
| Builder::default() |
| } |
| |
| /// Build an Aho-Corasick DFA from the given iterator of patterns. |
| /// |
| /// A builder may be reused to create more DFAs. |
| pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError> |
| where |
| I: IntoIterator<Item = P>, |
| P: AsRef<[u8]>, |
| { |
| let nnfa = self.noncontiguous.build(patterns)?; |
| self.build_from_noncontiguous(&nnfa) |
| } |
| |
| /// Build an Aho-Corasick DFA from the given noncontiguous NFA. |
| /// |
| /// Note that when this method is used, only the `start_kind` and |
| /// `byte_classes` settings on this builder are respected. The other |
| /// settings only apply to the initial construction of the Aho-Corasick |
| /// automaton. Since using this method requires that initial construction |
| /// has already completed, all settings impacting only initial construction |
| /// are no longer relevant. |
| pub fn build_from_noncontiguous( |
| &self, |
| nnfa: &noncontiguous::NFA, |
| ) -> Result<DFA, BuildError> { |
| debug!("building DFA"); |
| let byte_classes = if self.byte_classes { |
| nnfa.byte_classes().clone() |
| } else { |
| ByteClasses::singletons() |
| }; |
| let state_len = match self.start_kind { |
| StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(), |
| StartKind::Both => { |
| // These unwraps are OK because we know that the number of |
| // NFA states is < StateID::LIMIT which is in turn less than |
| // i32::MAX. Thus, there is always room to multiply by 2. |
| // Finally, the number of states is always at least 4 in the |
| // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the |
| // subtraction of 4 is okay. |
| // |
| // Note that we subtract 4 because the "anchored" part of |
| // the DFA duplicates the unanchored part (without failure |
| // transitions), but reuses the DEAD, FAIL and START states. |
| nnfa.states() |
| .len() |
| .checked_mul(2) |
| .unwrap() |
| .checked_sub(4) |
| .unwrap() |
| } |
| }; |
| let trans_len = |
| match state_len.checked_shl(byte_classes.stride2().as_u32()) { |
| Some(trans_len) => trans_len, |
| None => { |
| return Err(BuildError::state_id_overflow( |
| StateID::MAX.as_u64(), |
| usize::MAX.as_u64(), |
| )) |
| } |
| }; |
| StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap()) |
| .map_err(|e| { |
| BuildError::state_id_overflow( |
| StateID::MAX.as_u64(), |
| e.attempted(), |
| ) |
| })?; |
| let num_match_states = match self.start_kind { |
| StartKind::Unanchored | StartKind::Anchored => { |
| nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap() |
| } |
| StartKind::Both => nnfa |
| .special() |
| .max_match_id |
| .as_usize() |
| .checked_sub(1) |
| .unwrap() |
| .checked_mul(2) |
| .unwrap(), |
| }; |
| let mut dfa = DFA { |
| trans: vec![DFA::DEAD; trans_len], |
| matches: vec![vec![]; num_match_states], |
| matches_memory_usage: 0, |
| pattern_lens: nnfa.pattern_lens_raw().to_vec(), |
| prefilter: nnfa.prefilter().map(|p| p.clone()), |
| match_kind: nnfa.match_kind(), |
| state_len, |
| alphabet_len: byte_classes.alphabet_len(), |
| stride2: byte_classes.stride2(), |
| byte_classes, |
| min_pattern_len: nnfa.min_pattern_len(), |
| max_pattern_len: nnfa.max_pattern_len(), |
| // The special state IDs are set later. |
| special: Special::zero(), |
| }; |
| match self.start_kind { |
| StartKind::Both => { |
| self.finish_build_both_starts(nnfa, &mut dfa); |
| } |
| StartKind::Unanchored => { |
| self.finish_build_one_start(Anchored::No, nnfa, &mut dfa); |
| } |
| StartKind::Anchored => { |
| self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa) |
| } |
| } |
| debug!( |
| "DFA built, <states: {:?}, size: {:?}, \ |
| alphabet len: {:?}, stride: {:?}>", |
| dfa.state_len, |
| dfa.memory_usage(), |
| dfa.byte_classes.alphabet_len(), |
| dfa.byte_classes.stride(), |
| ); |
| // The vectors can grow ~twice as big during construction because a |
| // Vec amortizes growth. But here, let's shrink things back down to |
| // what we actually need since we're never going to add more to it. |
| dfa.trans.shrink_to_fit(); |
| dfa.pattern_lens.shrink_to_fit(); |
| dfa.matches.shrink_to_fit(); |
| // TODO: We might also want to shrink each Vec inside of `dfa.matches`, |
| // or even better, convert it to one contiguous allocation. But I think |
| // I went with nested allocs for good reason (can't remember), so this |
| // may be tricky to do. I decided not to shrink them here because it |
| // might require a fair bit of work to do. It's unclear whether it's |
| // worth it. |
| Ok(dfa) |
| } |
| |
| /// Finishes building a DFA for either unanchored or anchored searches, |
| /// but NOT both. |
| fn finish_build_one_start( |
| &self, |
| anchored: Anchored, |
| nnfa: &noncontiguous::NFA, |
| dfa: &mut DFA, |
| ) { |
| // This function always succeeds because we check above that all of the |
| // states in the NFA can be mapped to DFA state IDs. |
| let stride2 = dfa.stride2; |
| let old2new = |oldsid: StateID| { |
| StateID::new_unchecked(oldsid.as_usize() << stride2) |
| }; |
| for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
| let newsid = old2new(oldsid); |
| if state.is_match() { |
| dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); |
| } |
| sparse_iter( |
| nnfa, |
| oldsid, |
| &dfa.byte_classes, |
| |byte, class, mut oldnextsid| { |
| if oldnextsid == noncontiguous::NFA::FAIL { |
| if anchored.is_anchored() { |
| oldnextsid = noncontiguous::NFA::DEAD; |
| } else { |
| oldnextsid = nnfa.next_state( |
| Anchored::No, |
| state.fail(), |
| byte, |
| ); |
| } |
| } |
| dfa.trans[newsid.as_usize() + usize::from(class)] = |
| old2new(oldnextsid); |
| }, |
| ); |
| } |
| // Now that we've remapped all the IDs in our states, all that's left |
| // is remapping the special state IDs. |
| let old = nnfa.special(); |
| let new = &mut dfa.special; |
| new.max_special_id = old2new(old.max_special_id); |
| new.max_match_id = old2new(old.max_match_id); |
| if anchored.is_anchored() { |
| new.start_unanchored_id = DFA::DEAD; |
| new.start_anchored_id = old2new(old.start_anchored_id); |
| } else { |
| new.start_unanchored_id = old2new(old.start_unanchored_id); |
| new.start_anchored_id = DFA::DEAD; |
| } |
| } |
| |
| /// Finishes building a DFA that supports BOTH unanchored and anchored |
| /// searches. It works by inter-leaving unanchored states with anchored |
| /// states in the same transition table. This way, we avoid needing to |
| /// re-shuffle states afterward to ensure that our states still look like |
| /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ... |
| /// |
| /// Honestly this is pretty inscrutable... Simplifications are most |
| /// welcome. |
| fn finish_build_both_starts( |
| &self, |
| nnfa: &noncontiguous::NFA, |
| dfa: &mut DFA, |
| ) { |
| let stride2 = dfa.stride2; |
| let stride = 1 << stride2; |
| let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()]; |
| let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()]; |
| let mut is_anchored = vec![false; dfa.state_len]; |
| let mut newsid = DFA::DEAD; |
| let next_dfa_id = |
| |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride); |
| for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
| if oldsid == noncontiguous::NFA::DEAD |
| || oldsid == noncontiguous::NFA::FAIL |
| { |
| remap_unanchored[oldsid] = newsid; |
| remap_anchored[oldsid] = newsid; |
| newsid = next_dfa_id(newsid); |
| } else if oldsid == nnfa.special().start_unanchored_id |
| || oldsid == nnfa.special().start_anchored_id |
| { |
| if oldsid == nnfa.special().start_unanchored_id { |
| remap_unanchored[oldsid] = newsid; |
| remap_anchored[oldsid] = DFA::DEAD; |
| } else { |
| remap_unanchored[oldsid] = DFA::DEAD; |
| remap_anchored[oldsid] = newsid; |
| is_anchored[newsid.as_usize() >> stride2] = true; |
| } |
| if state.is_match() { |
| dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); |
| } |
| sparse_iter( |
| nnfa, |
| oldsid, |
| &dfa.byte_classes, |
| |_, class, oldnextsid| { |
| let class = usize::from(class); |
| if oldnextsid == noncontiguous::NFA::FAIL { |
| dfa.trans[newsid.as_usize() + class] = DFA::DEAD; |
| } else { |
| dfa.trans[newsid.as_usize() + class] = oldnextsid; |
| } |
| }, |
| ); |
| newsid = next_dfa_id(newsid); |
| } else { |
| let unewsid = newsid; |
| newsid = next_dfa_id(newsid); |
| let anewsid = newsid; |
| newsid = next_dfa_id(newsid); |
| |
| remap_unanchored[oldsid] = unewsid; |
| remap_anchored[oldsid] = anewsid; |
| is_anchored[anewsid.as_usize() >> stride2] = true; |
| if state.is_match() { |
| dfa.set_matches(unewsid, nnfa.iter_matches(oldsid)); |
| dfa.set_matches(anewsid, nnfa.iter_matches(oldsid)); |
| } |
| sparse_iter( |
| nnfa, |
| oldsid, |
| &dfa.byte_classes, |
| |byte, class, oldnextsid| { |
| let class = usize::from(class); |
| if oldnextsid == noncontiguous::NFA::FAIL { |
| dfa.trans[unewsid.as_usize() + class] = nnfa |
| .next_state(Anchored::No, state.fail(), byte); |
| } else { |
| dfa.trans[unewsid.as_usize() + class] = oldnextsid; |
| dfa.trans[anewsid.as_usize() + class] = oldnextsid; |
| } |
| }, |
| ); |
| } |
| } |
| for i in 0..dfa.state_len { |
| let sid = i << stride2; |
| if is_anchored[i] { |
| for next in dfa.trans[sid..][..stride].iter_mut() { |
| *next = remap_anchored[*next]; |
| } |
| } else { |
| for next in dfa.trans[sid..][..stride].iter_mut() { |
| *next = remap_unanchored[*next]; |
| } |
| } |
| } |
| // Now that we've remapped all the IDs in our states, all that's left |
| // is remapping the special state IDs. |
| let old = nnfa.special(); |
| let new = &mut dfa.special; |
| new.max_special_id = remap_anchored[old.max_special_id]; |
| new.max_match_id = remap_anchored[old.max_match_id]; |
| new.start_unanchored_id = remap_unanchored[old.start_unanchored_id]; |
| new.start_anchored_id = remap_anchored[old.start_anchored_id]; |
| } |
| |
| /// Set the desired match semantics. |
| /// |
| /// This only applies when using [`Builder::build`] and not |
| /// [`Builder::build_from_noncontiguous`]. |
| /// |
| /// See |
| /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
| /// for more documentation and examples. |
| pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
| self.noncontiguous.match_kind(kind); |
| self |
| } |
| |
| /// Enable ASCII-aware case insensitive matching. |
| /// |
| /// This only applies when using [`Builder::build`] and not |
| /// [`Builder::build_from_noncontiguous`]. |
| /// |
| /// See |
| /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
| /// for more documentation and examples. |
| pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
| self.noncontiguous.ascii_case_insensitive(yes); |
| self |
| } |
| |
| /// Enable heuristic prefilter optimizations. |
| /// |
| /// This only applies when using [`Builder::build`] and not |
| /// [`Builder::build_from_noncontiguous`]. |
| /// |
| /// See |
| /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
| /// for more documentation and examples. |
| pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
| self.noncontiguous.prefilter(yes); |
| self |
| } |
| |
| /// Sets the starting state configuration for the automaton. |
| /// |
| /// See |
| /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
| /// for more documentation and examples. |
| pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { |
| self.start_kind = kind; |
| self |
| } |
| |
| /// A debug setting for whether to attempt to shrink the size of the |
| /// automaton's alphabet or not. |
| /// |
| /// This should never be enabled unless you're debugging an automaton. |
| /// Namely, disabling byte classes makes transitions easier to reason |
| /// about, since they use the actual bytes instead of equivalence classes. |
| /// Disabling this confers no performance benefit at search time. |
| /// |
| /// See |
| /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) |
| /// for more documentation and examples. |
| pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { |
| self.byte_classes = yes; |
| self |
| } |
| } |
| |
| /// Iterate over all possible equivalence class transitions in this state. |
| /// The closure is called for all transitions with a distinct equivalence |
| /// class, even those not explicitly represented in this sparse state. For |
| /// any implicitly defined transitions, the given closure is called with |
| /// the fail state ID. |
| /// |
| /// The closure is guaranteed to be called precisely |
| /// `byte_classes.alphabet_len()` times, once for every possible class in |
| /// ascending order. |
| fn sparse_iter<F: FnMut(u8, u8, StateID)>( |
| nnfa: &noncontiguous::NFA, |
| oldsid: StateID, |
| classes: &ByteClasses, |
| mut f: F, |
| ) { |
| let mut prev_class = None; |
| let mut byte = 0usize; |
| for t in nnfa.iter_trans(oldsid) { |
| while byte < usize::from(t.byte()) { |
| let rep = byte.as_u8(); |
| let class = classes.get(rep); |
| byte += 1; |
| if prev_class != Some(class) { |
| f(rep, class, noncontiguous::NFA::FAIL); |
| prev_class = Some(class); |
| } |
| } |
| let rep = t.byte(); |
| let class = classes.get(rep); |
| byte += 1; |
| if prev_class != Some(class) { |
| f(rep, class, t.next()); |
| prev_class = Some(class); |
| } |
| } |
| for b in byte..=255 { |
| let rep = b.as_u8(); |
| let class = classes.get(rep); |
| if prev_class != Some(class) { |
| f(rep, class, noncontiguous::NFA::FAIL); |
| prev_class = Some(class); |
| } |
| } |
| } |