| use core::{convert::TryFrom, fmt, mem, ops::Range}; |
| |
| use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; |
| |
| use crate::util::{ |
| alphabet::{self, ByteClassSet}, |
| decode_last_utf8, decode_utf8, |
| id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, |
| is_word_byte, is_word_char_fwd, is_word_char_rev, |
| }; |
| |
| pub use self::{ |
| compiler::{Builder, Config}, |
| error::Error, |
| }; |
| |
| mod compiler; |
| mod error; |
| mod map; |
| pub mod pikevm; |
| mod range_trie; |
| |
| /// A map from capture group name to its corresponding capture index. |
| /// |
| /// Since there are always two slots for each capture index, the pair of slots |
| /// corresponding to the capture index for a pattern ID of 0 are indexed at |
| /// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. |
| /// |
| /// This type is actually wrapped inside a Vec indexed by pattern ID on the |
| /// NFA, since multiple patterns may have the same capture group name. |
| /// |
| /// Note that this is somewhat of a sub-optimal representation, since it |
| /// requires a hashmap for each pattern. A better representation would be |
| /// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look |
| /// up a capture index by name without producing a `Arc<str>`, which requires |
| /// an allocation. To fix this, I think we'd need to define our own unsized |
| /// type or something? |
| #[cfg(feature = "std")] |
| type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>; |
| #[cfg(not(feature = "std"))] |
| type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>; |
| |
| // The NFA API below is not something I'm terribly proud of at the moment. In |
| // particular, it supports both mutating the NFA and actually using the NFA to |
| // perform a search. I think combining these two things muddies the waters a |
| // bit too much. |
| // |
| // I think the issue is that I saw the compiler as the 'builder,' and where |
| // the compiler had the ability to manipulate the internal state of the NFA. |
| // However, one of my goals was to make it possible for others to build their |
| // own NFAs in a way that is *not* couple to the regex-syntax crate. |
| // |
| // So I think really, there should be an NFA, a NFABuilder and then the |
| // internal compiler which uses the NFABuilder API to build an NFA. Alas, at |
| // the time of writing, I kind of ran out of steam. |
| |
| /// A fully compiled Thompson NFA. |
| /// |
| /// The states of the NFA are indexed by state IDs, which are how transitions |
| /// are expressed. |
| #[derive(Clone)] |
| pub struct NFA { |
| /// The state list. This list is guaranteed to be indexable by all starting |
| /// state IDs, and it is also guaranteed to contain at most one `Match` |
| /// state for each pattern compiled into this NFA. (A pattern may not have |
| /// a corresponding `Match` state if a `Match` state is impossible to |
| /// reach.) |
| states: Vec<State>, |
| /// The anchored starting state of this NFA. |
| start_anchored: StateID, |
| /// The unanchored starting state of this NFA. |
| start_unanchored: StateID, |
| /// The starting states for each individual pattern. Starting at any |
| /// of these states will result in only an anchored search for the |
| /// corresponding pattern. The vec is indexed by pattern ID. When the NFA |
| /// contains a single regex, then `start_pattern[0]` and `start_anchored` |
| /// are always equivalent. |
| start_pattern: Vec<StateID>, |
| /// A map from PatternID to its corresponding range of capture slots. Each |
| /// range is guaranteed to be contiguous with the previous range. The |
| /// end of the last range corresponds to the total number of slots needed |
| /// for this NFA. |
| patterns_to_slots: Vec<Range<usize>>, |
| /// A map from capture name to its corresponding index. So e.g., given |
| /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name |
| /// 'word' for pattern ID=0 would corresponding to the index '3'. Its |
| /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'. |
| capture_name_to_index: Vec<CaptureNameMap>, |
| /// A map from pattern ID to capture group index to name, if one exists. |
| /// This is effectively the inverse of 'capture_name_to_index'. The outer |
| /// vec is indexed by pattern ID, while the inner vec is index by capture |
| /// index offset for the corresponding pattern. |
| /// |
| /// The first capture group for each pattern is always unnamed and is thus |
| /// always None. |
| capture_index_to_name: Vec<Vec<Option<Arc<str>>>>, |
| /// A representation of equivalence classes over the transitions in this |
| /// NFA. Two bytes in the same equivalence class must not discriminate |
| /// between a match or a non-match. This map can be used to shrink the |
| /// total size of a DFA's transition table with a small match-time cost. |
| /// |
| /// Note that the NFA's transitions are *not* defined in terms of these |
| /// equivalence classes. The NFA's transitions are defined on the original |
| /// byte values. For the most part, this is because they wouldn't really |
| /// help the NFA much since the NFA already uses a sparse representation |
| /// to represent transitions. Byte classes are most effective in a dense |
| /// representation. |
| byte_class_set: ByteClassSet, |
| /// Various facts about this NFA, which can be used to improve failure |
| /// modes (e.g., rejecting DFA construction if an NFA has Unicode word |
| /// boundaries) or for performing optimizations (avoiding an increase in |
| /// states if there are no look-around states). |
| facts: Facts, |
| /// Heap memory used indirectly by NFA states. Since each state might use a |
| /// different amount of heap, we need to keep track of this incrementally. |
| memory_states: usize, |
| } |
| |
| impl NFA { |
| pub fn config() -> Config { |
| Config::new() |
| } |
| |
| pub fn builder() -> Builder { |
| Builder::new() |
| } |
| |
| /// Returns an NFA with no states. Its match semantics are unspecified. |
| /// |
| /// An empty NFA is useful as a starting point for building one. It is |
| /// itself not intended to be used for matching. For example, its starting |
| /// state identifiers are configured to be `0`, but since it has no states, |
| /// the identifiers are invalid. |
| /// |
| /// If you need an NFA that never matches is anything and can be correctly |
| /// used for matching, use [`NFA::never_match`]. |
| #[inline] |
| pub fn empty() -> NFA { |
| NFA { |
| states: vec![], |
| start_anchored: StateID::ZERO, |
| start_unanchored: StateID::ZERO, |
| start_pattern: vec![], |
| patterns_to_slots: vec![], |
| capture_name_to_index: vec![], |
| capture_index_to_name: vec![], |
| byte_class_set: ByteClassSet::empty(), |
| facts: Facts::default(), |
| memory_states: 0, |
| } |
| } |
| |
| /// Returns an NFA with a single regex that always matches at every |
| /// position. |
| #[inline] |
| pub fn always_match() -> NFA { |
| let mut nfa = NFA::empty(); |
| // Since we're only adding one pattern, these are guaranteed to work. |
| let start = nfa.add_match().unwrap(); |
| assert_eq!(start.as_usize(), 0); |
| let pid = nfa.finish_pattern(start).unwrap(); |
| assert_eq!(pid.as_usize(), 0); |
| nfa |
| } |
| |
| /// Returns an NFA that never matches at any position. It contains no |
| /// regexes. |
| #[inline] |
| pub fn never_match() -> NFA { |
| let mut nfa = NFA::empty(); |
| // Since we're only adding one state, this can never fail. |
| nfa.add_fail().unwrap(); |
| nfa |
| } |
| |
| /// Return the number of states in this NFA. |
| /// |
| /// This is guaranteed to be no bigger than [`StateID::LIMIT`]. |
| #[inline] |
| pub fn len(&self) -> usize { |
| self.states.len() |
| } |
| |
| /// Returns the total number of distinct match states in this NFA. |
| /// Stated differently, this returns the total number of regex patterns |
| /// used to build this NFA. |
| /// |
| /// This may return zero if the NFA was constructed with no patterns. In |
| /// this case, and only this case, the NFA can never produce a match for |
| /// any input. |
| /// |
| /// This is guaranteed to be no bigger than [`PatternID::LIMIT`]. |
| #[inline] |
| pub fn pattern_len(&self) -> usize { |
| self.start_pattern.len() |
| } |
| |
| /// Returns the pattern ID of the pattern currently being compiled by this |
| /// NFA. |
| fn current_pattern_id(&self) -> PatternID { |
| // This always works because we never permit more patterns in |
| // 'start_pattern' than can be addressed by PatternID. Also, we only |
| // add a new entry to 'start_pattern' once we finish compiling a |
| // pattern. Thus, the length refers to the ID of the current pattern |
| // being compiled. |
| PatternID::new(self.start_pattern.len()).unwrap() |
| } |
| |
| /// Returns the total number of capturing groups in this NFA. |
| /// |
| /// This includes the special 0th capture group that is always present and |
| /// captures the start and end offset of the entire match. |
| /// |
| /// This is a convenience routine for `nfa.capture_slot_len() / 2`. |
| #[inline] |
| pub fn capture_len(&self) -> usize { |
| let slots = self.capture_slot_len(); |
| // This assert is guaranteed to pass since the NFA construction process |
| // guarantees that it is always true. |
| assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); |
| slots / 2 |
| } |
| |
| /// Returns the total number of capturing slots in this NFA. |
| /// |
| /// This value is guaranteed to be a multiple of 2. (Where each capturing |
| /// group has precisely two capturing slots in the NFA.) |
| #[inline] |
| pub fn capture_slot_len(&self) -> usize { |
| self.patterns_to_slots.last().map_or(0, |r| r.end) |
| } |
| |
| /// Return a range of capture slots for the given pattern. |
| /// |
| /// The range returned is guaranteed to be contiguous with ranges for |
| /// adjacent patterns. |
| /// |
| /// This panics if the given pattern ID is greater than or equal to the |
| /// number of patterns in this NFA. |
| #[inline] |
| pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { |
| self.patterns_to_slots[pid].clone() |
| } |
| |
| /// Return the capture group index corresponding to the given name in the |
| /// given pattern. If no such capture group name exists in the given |
| /// pattern, then this returns `None`. |
| /// |
| /// If the given pattern ID is invalid, then this panics. |
| #[inline] |
| pub fn capture_name_to_index( |
| &self, |
| pid: PatternID, |
| name: &str, |
| ) -> Option<usize> { |
| assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); |
| self.capture_name_to_index[pid].get(name).cloned() |
| } |
| |
| // TODO: add iterators over capture group names. |
| // Do we also permit indexing? |
| |
| /// Returns an iterator over all pattern IDs in this NFA. |
| #[inline] |
| pub fn patterns(&self) -> PatternIter { |
| PatternIter { |
| it: PatternID::iter(self.pattern_len()), |
| _marker: core::marker::PhantomData, |
| } |
| } |
| |
| /// Return the ID of the initial anchored state of this NFA. |
| #[inline] |
| pub fn start_anchored(&self) -> StateID { |
| self.start_anchored |
| } |
| |
| /// Set the anchored starting state ID for this NFA. |
| #[inline] |
| pub fn set_start_anchored(&mut self, id: StateID) { |
| self.start_anchored = id; |
| } |
| |
| /// Return the ID of the initial unanchored state of this NFA. |
| #[inline] |
| pub fn start_unanchored(&self) -> StateID { |
| self.start_unanchored |
| } |
| |
| /// Set the unanchored starting state ID for this NFA. |
| #[inline] |
| pub fn set_start_unanchored(&mut self, id: StateID) { |
| self.start_unanchored = id; |
| } |
| |
| /// Return the ID of the initial anchored state for the given pattern. |
| /// |
| /// If the pattern doesn't exist in this NFA, then this panics. |
| #[inline] |
| pub fn start_pattern(&self, pid: PatternID) -> StateID { |
| self.start_pattern[pid] |
| } |
| |
| /// Get the byte class set for this NFA. |
| #[inline] |
| pub fn byte_class_set(&self) -> &ByteClassSet { |
| &self.byte_class_set |
| } |
| |
| /// Return a reference to the NFA state corresponding to the given ID. |
| #[inline] |
| pub fn state(&self, id: StateID) -> &State { |
| &self.states[id] |
| } |
| |
| /// Returns a slice of all states in this NFA. |
| /// |
| /// The slice returned may be indexed by a `StateID` generated by `add`. |
| #[inline] |
| pub fn states(&self) -> &[State] { |
| &self.states |
| } |
| |
| #[inline] |
| pub fn is_always_start_anchored(&self) -> bool { |
| self.start_anchored() == self.start_unanchored() |
| } |
| |
| #[inline] |
| pub fn has_any_look(&self) -> bool { |
| self.facts.has_any_look() |
| } |
| |
| #[inline] |
| pub fn has_any_anchor(&self) -> bool { |
| self.facts.has_any_anchor() |
| } |
| |
| #[inline] |
| pub fn has_word_boundary(&self) -> bool { |
| self.has_word_boundary_unicode() || self.has_word_boundary_ascii() |
| } |
| |
| #[inline] |
| pub fn has_word_boundary_unicode(&self) -> bool { |
| self.facts.has_word_boundary_unicode() |
| } |
| |
| #[inline] |
| pub fn has_word_boundary_ascii(&self) -> bool { |
| self.facts.has_word_boundary_ascii() |
| } |
| |
| /// Returns the memory usage, in bytes, of this NFA. |
| /// |
| /// This does **not** include the stack size used up by this NFA. To |
| /// compute that, use `std::mem::size_of::<NFA>()`. |
| #[inline] |
| pub fn memory_usage(&self) -> usize { |
| self.states.len() * mem::size_of::<State>() |
| + self.memory_states |
| + self.start_pattern.len() * mem::size_of::<StateID>() |
| } |
| |
| // Why do we define a bunch of 'add_*' routines below instead of just |
| // defining a single 'add' routine that accepts a 'State'? Indeed, for most |
| // of the 'add_*' routines below, such a simple API would be more than |
| // appropriate. Unfortunately, adding capture states and, to a lesser |
| // extent, match states, is a bit more complex. Namely, when we add a |
| // capture state, we *really* want to know the corresponding capture |
| // group's name and index and what not, so that we can update other state |
| // inside this NFA. But, e.g., the capture group name is not and should |
| // not be included in 'State::Capture'. So what are our choices? |
| // |
| // 1) Define one 'add' and require some additional optional parameters. |
| // This feels quite ugly, and adds unnecessary complexity to more common |
| // and simpler cases. |
| // |
| // 2) Do what we do below. The sad thing is that our API is bigger with |
| // more methods. But each method is very specific and hopefully simple. |
| // |
| // 3) Define a new enum, say, 'StateWithInfo', or something that permits |
| // providing both a State and some extra ancillary info in some cases. This |
| // doesn't seem too bad to me, but seems slightly worse than (2) because of |
| // the additional type required. |
| // |
| // 4) Abandon the idea that we have to specify things like the capture |
| // group name when we add the Capture state to the NFA. We would then need |
| // to add other methods that permit the caller to add this additional state |
| // "out of band." Other than it introducing some additional complexity, I |
| // decided against this because I wanted the NFA builder API to make it |
| // as hard as possible to build a bad or invalid NFA. Using the approach |
| // below, as you'll see, permits us to do a lot of strict checking of our |
| // inputs and return an error if we see something we don't expect. |
| |
| pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { |
| self.byte_class_set.set_range(range.start, range.end); |
| self.add_state(State::Range { range }) |
| } |
| |
| pub fn add_sparse( |
| &mut self, |
| sparse: SparseTransitions, |
| ) -> Result<StateID, Error> { |
| for range in sparse.ranges.iter() { |
| self.byte_class_set.set_range(range.start, range.end); |
| } |
| self.add_state(State::Sparse(sparse)) |
| } |
| |
| pub fn add_look( |
| &mut self, |
| next: StateID, |
| look: Look, |
| ) -> Result<StateID, Error> { |
| self.facts.set_has_any_look(true); |
| look.add_to_byteset(&mut self.byte_class_set); |
| match look { |
| Look::StartLine |
| | Look::EndLine |
| | Look::StartText |
| | Look::EndText => { |
| self.facts.set_has_any_anchor(true); |
| } |
| Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { |
| self.facts.set_has_word_boundary_unicode(true); |
| } |
| Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { |
| self.facts.set_has_word_boundary_ascii(true); |
| } |
| } |
| self.add_state(State::Look { look, next }) |
| } |
| |
| pub fn add_union( |
| &mut self, |
| alternates: Box<[StateID]>, |
| ) -> Result<StateID, Error> { |
| self.add_state(State::Union { alternates }) |
| } |
| |
| pub fn add_capture_start( |
| &mut self, |
| next_id: StateID, |
| capture_index: u32, |
| name: Option<Arc<str>>, |
| ) -> Result<StateID, Error> { |
| let pid = self.current_pattern_id(); |
| let capture_index = match usize::try_from(capture_index) { |
| Err(_) => { |
| return Err(Error::invalid_capture_index(core::usize::MAX)) |
| } |
| Ok(capture_index) => capture_index, |
| }; |
| // Do arithmetic to find our absolute slot index first, to make sure |
| // the index is at least possibly valid (doesn't overflow). |
| let relative_slot = match capture_index.checked_mul(2) { |
| Some(relative_slot) => relative_slot, |
| None => return Err(Error::invalid_capture_index(capture_index)), |
| }; |
| let slot = match relative_slot.checked_add(self.capture_slot_len()) { |
| Some(slot) => slot, |
| None => return Err(Error::invalid_capture_index(capture_index)), |
| }; |
| // Make sure we have space to insert our (pid,index)|-->name mapping. |
| if pid.as_usize() >= self.capture_index_to_name.len() { |
| // Note that we require that if you're adding capturing groups, |
| // then there must be at least one capturing group per pattern. |
| // Moreover, whenever we expand our space here, it should always |
| // first be for the first capture group (at index==0). |
| if pid.as_usize() > self.capture_index_to_name.len() |
| || capture_index > 0 |
| { |
| return Err(Error::invalid_capture_index(capture_index)); |
| } |
| self.capture_name_to_index.push(CaptureNameMap::new()); |
| self.capture_index_to_name.push(vec![]); |
| } |
| if capture_index >= self.capture_index_to_name[pid].len() { |
| // We require that capturing groups are added in correspondence |
| // to their index. So no discontinuous indices. This is likely |
| // overly strict, but also makes it simpler to provide guarantees |
| // about our capturing group data. |
| if capture_index > self.capture_index_to_name[pid].len() { |
| return Err(Error::invalid_capture_index(capture_index)); |
| } |
| self.capture_index_to_name[pid].push(None); |
| } |
| if let Some(ref name) = name { |
| self.capture_name_to_index[pid] |
| .insert(Arc::clone(name), capture_index); |
| } |
| self.capture_index_to_name[pid][capture_index] = name; |
| self.add_state(State::Capture { next: next_id, slot }) |
| } |
| |
| pub fn add_capture_end( |
| &mut self, |
| next_id: StateID, |
| capture_index: u32, |
| ) -> Result<StateID, Error> { |
| let pid = self.current_pattern_id(); |
| let capture_index = match usize::try_from(capture_index) { |
| Err(_) => { |
| return Err(Error::invalid_capture_index(core::usize::MAX)) |
| } |
| Ok(capture_index) => capture_index, |
| }; |
| // If we haven't already added this capture group via a corresponding |
| // 'add_capture_start' call, then we consider the index given to be |
| // invalid. |
| if pid.as_usize() >= self.capture_index_to_name.len() |
| || capture_index >= self.capture_index_to_name[pid].len() |
| { |
| return Err(Error::invalid_capture_index(capture_index)); |
| } |
| // Since we've already confirmed that this capture index is invalid |
| // and has a corresponding starting slot, we know the multiplcation |
| // has already been done and succeeded. |
| let relative_slot_start = capture_index.checked_mul(2).unwrap(); |
| let relative_slot = match relative_slot_start.checked_add(1) { |
| Some(relative_slot) => relative_slot, |
| None => return Err(Error::invalid_capture_index(capture_index)), |
| }; |
| let slot = match relative_slot.checked_add(self.capture_slot_len()) { |
| Some(slot) => slot, |
| None => return Err(Error::invalid_capture_index(capture_index)), |
| }; |
| self.add_state(State::Capture { next: next_id, slot }) |
| } |
| |
| pub fn add_fail(&mut self) -> Result<StateID, Error> { |
| self.add_state(State::Fail) |
| } |
| |
| /// Add a new match state to this NFA and return its state ID. |
| pub fn add_match(&mut self) -> Result<StateID, Error> { |
| let pattern_id = self.current_pattern_id(); |
| let sid = self.add_state(State::Match { id: pattern_id })?; |
| Ok(sid) |
| } |
| |
| /// Finish compiling the current pattern and return its identifier. The |
| /// given ID should be the state ID corresponding to the anchored starting |
| /// state for matching this pattern. |
| pub fn finish_pattern( |
| &mut self, |
| start_id: StateID, |
| ) -> Result<PatternID, Error> { |
| // We've gotta make sure that we never permit the user to add more |
| // patterns than we can identify. So if we're already at the limit, |
| // then return an error. This is somewhat non-ideal since this won't |
| // result in an error until trying to complete the compilation of a |
| // pattern instead of starting it. |
| if self.start_pattern.len() >= PatternID::LIMIT { |
| return Err(Error::too_many_patterns( |
| self.start_pattern.len().saturating_add(1), |
| )); |
| } |
| let pid = self.current_pattern_id(); |
| self.start_pattern.push(start_id); |
| // Add the number of new slots created by this pattern. This is always |
| // equivalent to '2 * caps.len()', where 'caps.len()' is the number of |
| // new capturing groups introduced by the pattern we're finishing. |
| let new_cap_groups = self |
| .capture_index_to_name |
| .get(pid.as_usize()) |
| .map_or(0, |caps| caps.len()); |
| let new_slots = match new_cap_groups.checked_mul(2) { |
| Some(new_slots) => new_slots, |
| None => { |
| // Just return the biggest index that we know exists. |
| let index = new_cap_groups.saturating_sub(1); |
| return Err(Error::invalid_capture_index(index)); |
| } |
| }; |
| let slot_start = self.capture_slot_len(); |
| self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); |
| Ok(pid) |
| } |
| |
| fn add_state(&mut self, state: State) -> Result<StateID, Error> { |
| let id = StateID::new(self.states.len()) |
| .map_err(|_| Error::too_many_states(self.states.len()))?; |
| self.memory_states += state.memory_usage(); |
| self.states.push(state); |
| Ok(id) |
| } |
| |
| /// Remap the transitions in every state of this NFA using the given map. |
| /// The given map should be indexed according to state ID namespace used by |
| /// the transitions of the states currently in this NFA. |
| /// |
| /// This may be used during the final phases of an NFA compiler, which |
| /// turns its intermediate NFA into the final NFA. Remapping may be |
| /// required to bring the state pointers from the intermediate NFA to the |
| /// final NFA. |
| pub fn remap(&mut self, old_to_new: &[StateID]) { |
| for state in &mut self.states { |
| state.remap(old_to_new); |
| } |
| self.start_anchored = old_to_new[self.start_anchored]; |
| self.start_unanchored = old_to_new[self.start_unanchored]; |
| for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { |
| *id = old_to_new[*id]; |
| } |
| } |
| |
| /// Clear this NFA such that it has zero states and is otherwise "empty." |
| /// |
| /// An empty NFA is useful as a starting point for building one. It is |
| /// itself not intended to be used for matching. For example, its starting |
| /// state identifiers are configured to be `0`, but since it has no states, |
| /// the identifiers are invalid. |
| pub fn clear(&mut self) { |
| self.states.clear(); |
| self.start_anchored = StateID::ZERO; |
| self.start_unanchored = StateID::ZERO; |
| self.start_pattern.clear(); |
| self.patterns_to_slots.clear(); |
| self.capture_name_to_index.clear(); |
| self.capture_index_to_name.clear(); |
| self.byte_class_set = ByteClassSet::empty(); |
| self.facts = Facts::default(); |
| self.memory_states = 0; |
| } |
| } |
| |
| impl fmt::Debug for NFA { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| writeln!(f, "thompson::NFA(")?; |
| for (sid, state) in self.states.iter().with_state_ids() { |
| let status = if sid == self.start_anchored { |
| '^' |
| } else if sid == self.start_unanchored { |
| '>' |
| } else { |
| ' ' |
| }; |
| writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; |
| } |
| if self.pattern_len() > 1 { |
| writeln!(f, "")?; |
| for pid in self.patterns() { |
| let sid = self.start_pattern(pid); |
| writeln!( |
| f, |
| "START({:06?}): {:?}", |
| pid.as_usize(), |
| sid.as_usize() |
| )?; |
| } |
| } |
| writeln!(f, "")?; |
| writeln!( |
| f, |
| "transition equivalence classes: {:?}", |
| self.byte_class_set().byte_classes() |
| )?; |
| writeln!(f, ")")?; |
| Ok(()) |
| } |
| } |
| |
| /// A state in a final compiled NFA. |
| #[derive(Clone, Eq, PartialEq)] |
| pub enum State { |
| /// A state that transitions to `next` if and only if the current input |
| /// byte is in the range `[start, end]` (inclusive). |
| /// |
| /// This is a special case of Sparse in that it encodes only one transition |
| /// (and therefore avoids the allocation). |
| Range { range: Transition }, |
| /// A state with possibly many transitions, represented in a sparse |
| /// fashion. Transitions are ordered lexicographically by input range. As |
| /// such, this may only be used when every transition has equal priority. |
| /// (In practice, this is only used for encoding UTF-8 automata.) |
| Sparse(SparseTransitions), |
| /// A conditional epsilon transition satisfied via some sort of |
| /// look-around. |
| Look { look: Look, next: StateID }, |
| /// An alternation such that there exists an epsilon transition to all |
| /// states in `alternates`, where matches found via earlier transitions |
| /// are preferred over later transitions. |
| Union { alternates: Box<[StateID]> }, |
| /// An empty state that records a capture location. |
| /// |
| /// From the perspective of finite automata, this is precisely equivalent |
| /// to an epsilon transition, but serves the purpose of instructing NFA |
| /// simulations to record additional state when the finite state machine |
| /// passes through this epsilon transition. |
| /// |
| /// These transitions are treated as epsilon transitions with no additional |
| /// effects in DFAs. |
| /// |
| /// 'slot' in this context refers to the specific capture group offset that |
| /// is being recorded. Each capturing group has two slots corresponding to |
| /// the start and end of the matching portion of that group. |
| /// A fail state. When encountered, the automaton is guaranteed to never |
| /// reach a match state. |
| Capture { next: StateID, slot: usize }, |
| /// A state that cannot be transitioned out of. If a search reaches this |
| /// state, then no match is possible and the search should terminate. |
| Fail, |
| /// A match state. There is exactly one such occurrence of this state for |
| /// each regex compiled into the NFA. |
| Match { id: PatternID }, |
| } |
| |
| impl State { |
| /// Returns true if and only if this state contains one or more epsilon |
| /// transitions. |
| #[inline] |
| pub fn is_epsilon(&self) -> bool { |
| match *self { |
| State::Range { .. } |
| | State::Sparse { .. } |
| | State::Fail |
| | State::Match { .. } => false, |
| State::Look { .. } |
| | State::Union { .. } |
| | State::Capture { .. } => true, |
| } |
| } |
| |
| /// Returns the heap memory usage of this NFA state in bytes. |
| fn memory_usage(&self) -> usize { |
| match *self { |
| State::Range { .. } |
| | State::Look { .. } |
| | State::Capture { .. } |
| | State::Match { .. } |
| | State::Fail => 0, |
| State::Sparse(SparseTransitions { ref ranges }) => { |
| ranges.len() * mem::size_of::<Transition>() |
| } |
| State::Union { ref alternates } => { |
| alternates.len() * mem::size_of::<StateID>() |
| } |
| } |
| } |
| |
| /// Remap the transitions in this state using the given map. Namely, the |
| /// given map should be indexed according to the transitions currently |
| /// in this state. |
| /// |
| /// This is used during the final phase of the NFA compiler, which turns |
| /// its intermediate NFA into the final NFA. |
| fn remap(&mut self, remap: &[StateID]) { |
| match *self { |
| State::Range { ref mut range } => range.next = remap[range.next], |
| State::Sparse(SparseTransitions { ref mut ranges }) => { |
| for r in ranges.iter_mut() { |
| r.next = remap[r.next]; |
| } |
| } |
| State::Look { ref mut next, .. } => *next = remap[*next], |
| State::Union { ref mut alternates } => { |
| for alt in alternates.iter_mut() { |
| *alt = remap[*alt]; |
| } |
| } |
| State::Capture { ref mut next, .. } => *next = remap[*next], |
| State::Fail => {} |
| State::Match { .. } => {} |
| } |
| } |
| } |
| |
| impl fmt::Debug for State { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match *self { |
| State::Range { ref range } => range.fmt(f), |
| State::Sparse(SparseTransitions { ref ranges }) => { |
| let rs = ranges |
| .iter() |
| .map(|t| format!("{:?}", t)) |
| .collect::<Vec<String>>() |
| .join(", "); |
| write!(f, "sparse({})", rs) |
| } |
| State::Look { ref look, next } => { |
| write!(f, "{:?} => {:?}", look, next.as_usize()) |
| } |
| State::Union { ref alternates } => { |
| let alts = alternates |
| .iter() |
| .map(|id| format!("{:?}", id.as_usize())) |
| .collect::<Vec<String>>() |
| .join(", "); |
| write!(f, "alt({})", alts) |
| } |
| State::Capture { next, slot } => { |
| write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) |
| } |
| State::Fail => write!(f, "FAIL"), |
| State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), |
| } |
| } |
| } |
| |
| /// A collection of facts about an NFA. |
| /// |
| /// There are no real cohesive principles behind what gets put in here. For |
| /// the most part, it is implementation driven. |
| #[derive(Clone, Copy, Debug, Default)] |
| struct Facts { |
| /// Various yes/no facts about this NFA. |
| bools: u16, |
| } |
| |
| impl Facts { |
| define_bool!(0, has_any_look, set_has_any_look); |
| define_bool!(1, has_any_anchor, set_has_any_anchor); |
| define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); |
| define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); |
| } |
| |
| /// A sequence of transitions used to represent a sparse state. |
| #[derive(Clone, Debug, Eq, PartialEq)] |
| pub struct SparseTransitions { |
| pub ranges: Box<[Transition]>, |
| } |
| |
| impl SparseTransitions { |
| pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { |
| haystack.get(at).and_then(|&b| self.matches_byte(b)) |
| } |
| |
| pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { |
| unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) |
| } |
| |
| pub fn matches_byte(&self, byte: u8) -> Option<StateID> { |
| for t in self.ranges.iter() { |
| if t.start > byte { |
| break; |
| } else if t.matches_byte(byte) { |
| return Some(t.next); |
| } |
| } |
| None |
| |
| /* |
| // This is an alternative implementation that uses binary search. In |
| // some ad hoc experiments, like |
| // |
| // smallishru=OpenSubtitles2018.raw.sample.smallish.ru |
| // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' |
| // |
| // I could not observe any improvement, and in fact, things seemed to |
| // be a bit slower. |
| self.ranges |
| .binary_search_by(|t| { |
| if t.end < byte { |
| core::cmp::Ordering::Less |
| } else if t.start > byte { |
| core::cmp::Ordering::Greater |
| } else { |
| core::cmp::Ordering::Equal |
| } |
| }) |
| .ok() |
| .map(|i| self.ranges[i].next) |
| */ |
| } |
| } |
| |
| /// A transition to another state, only if the given byte falls in the |
| /// inclusive range specified. |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct Transition { |
| pub start: u8, |
| pub end: u8, |
| pub next: StateID, |
| } |
| |
| impl Transition { |
| pub fn matches(&self, haystack: &[u8], at: usize) -> bool { |
| haystack.get(at).map_or(false, |&b| self.matches_byte(b)) |
| } |
| |
| pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { |
| unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) |
| } |
| |
| pub fn matches_byte(&self, byte: u8) -> bool { |
| self.start <= byte && byte <= self.end |
| } |
| } |
| |
| impl fmt::Debug for Transition { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| use crate::util::DebugByte; |
| |
| let Transition { start, end, next } = *self; |
| if self.start == self.end { |
| write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) |
| } else { |
| write!( |
| f, |
| "{:?}-{:?} => {:?}", |
| DebugByte(start), |
| DebugByte(end), |
| next.as_usize(), |
| ) |
| } |
| } |
| } |
| |
| /// A conditional NFA epsilon transition. |
| /// |
| /// A simulation of the NFA can only move through this epsilon transition if |
| /// the current position satisfies some look-around property. Some assertions |
| /// are look-behind (StartLine, StartText), some assertions are look-ahead |
| /// (EndLine, EndText) while other assertions are both look-behind and |
| /// look-ahead (WordBoundary*). |
| #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| pub enum Look { |
| /// The previous position is either `\n` or the current position is the |
| /// beginning of the haystack (i.e., at position `0`). |
| StartLine = 1 << 0, |
| /// The next position is either `\n` or the current position is the end of |
| /// the haystack (i.e., at position `haystack.len()`). |
| EndLine = 1 << 1, |
| /// The current position is the beginning of the haystack (i.e., at |
| /// position `0`). |
| StartText = 1 << 2, |
| /// The current position is the end of the haystack (i.e., at position |
| /// `haystack.len()`). |
| EndText = 1 << 3, |
| /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])` |
| /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only |
| /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if |
| /// `i=haystack.len()`, then `is_word(n)=false`. |
| WordBoundaryUnicode = 1 << 4, |
| /// Same as for `WordBoundaryUnicode`, but requires that |
| /// `is_word(p) == is_word(n)`. |
| WordBoundaryUnicodeNegate = 1 << 5, |
| /// When tested at position `i`, where `p=haystack[i-1]` and |
| /// `n=haystack[i]`, this assertion passes if and only if `is_word(p) |
| /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if |
| /// `i=haystack.len()`, then `is_word(n)=false`. |
| WordBoundaryAscii = 1 << 6, |
| /// Same as for `WordBoundaryAscii`, but requires that |
| /// `is_word(p) == is_word(n)`. |
| /// |
| /// Note that it is possible for this assertion to match at positions that |
| /// split the UTF-8 encoding of a codepoint. For this reason, this may only |
| /// be used when UTF-8 mode is disable in the regex syntax. |
| WordBoundaryAsciiNegate = 1 << 7, |
| } |
| |
| impl Look { |
| #[inline(always)] |
| pub fn matches(&self, bytes: &[u8], at: usize) -> bool { |
| match *self { |
| Look::StartLine => at == 0 || bytes[at - 1] == b'\n', |
| Look::EndLine => at == bytes.len() || bytes[at] == b'\n', |
| Look::StartText => at == 0, |
| Look::EndText => at == bytes.len(), |
| Look::WordBoundaryUnicode => { |
| let word_before = is_word_char_rev(bytes, at); |
| let word_after = is_word_char_fwd(bytes, at); |
| word_before != word_after |
| } |
| Look::WordBoundaryUnicodeNegate => { |
| // This is pretty subtle. Why do we need to do UTF-8 decoding |
| // here? Well... at time of writing, the is_word_char_{fwd,rev} |
| // routines will only return true if there is a valid UTF-8 |
| // encoding of a "word" codepoint, and false in every other |
| // case (including invalid UTF-8). This means that in regions |
| // of invalid UTF-8 (which might be a subset of valid UTF-8!), |
| // it would result in \B matching. While this would be |
| // questionable in the context of truly invalid UTF-8, it is |
| // *certainly* wrong to report match boundaries that split the |
| // encoding of a codepoint. So to work around this, we ensure |
| // that we can decode a codepoint on either side of `at`. If |
| // either direction fails, then we don't permit \B to match at |
| // all. |
| // |
| // Now, this isn't exactly optimal from a perf perspective. We |
| // could try and detect this in is_word_char_{fwd,rev}, but |
| // it's not clear if it's worth it. \B is, after all, rarely |
| // used. |
| // |
| // And in particular, we do *not* have to do this with \b, |
| // because \b *requires* that at least one side of `at` be a |
| // "word" codepoint, which in turn implies one side of `at` |
| // must be valid UTF-8. This in turn implies that \b can never |
| // split a valid UTF-8 encoding of a codepoint. In the case |
| // where one side of `at` is truly invalid UTF-8 and the other |
| // side IS a word codepoint, then we want \b to match since it |
| // represents a valid UTF-8 boundary. It also makes sense. For |
| // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. |
| let word_before = at > 0 |
| && match decode_last_utf8(&bytes[..at]) { |
| None | Some(Err(_)) => return false, |
| Some(Ok(_)) => is_word_char_rev(bytes, at), |
| }; |
| let word_after = at < bytes.len() |
| && match decode_utf8(&bytes[at..]) { |
| None | Some(Err(_)) => return false, |
| Some(Ok(_)) => is_word_char_fwd(bytes, at), |
| }; |
| word_before == word_after |
| } |
| Look::WordBoundaryAscii => { |
| let word_before = at > 0 && is_word_byte(bytes[at - 1]); |
| let word_after = at < bytes.len() && is_word_byte(bytes[at]); |
| word_before != word_after |
| } |
| Look::WordBoundaryAsciiNegate => { |
| let word_before = at > 0 && is_word_byte(bytes[at - 1]); |
| let word_after = at < bytes.len() && is_word_byte(bytes[at]); |
| word_before == word_after |
| } |
| } |
| } |
| |
| /// Create a look-around assertion from its corresponding integer (as |
| /// defined in `Look`). If the given integer does not correspond to any |
| /// assertion, then None is returned. |
| fn from_int(n: u8) -> Option<Look> { |
| match n { |
| 0b0000_0001 => Some(Look::StartLine), |
| 0b0000_0010 => Some(Look::EndLine), |
| 0b0000_0100 => Some(Look::StartText), |
| 0b0000_1000 => Some(Look::EndText), |
| 0b0001_0000 => Some(Look::WordBoundaryUnicode), |
| 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), |
| 0b0100_0000 => Some(Look::WordBoundaryAscii), |
| 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), |
| _ => None, |
| } |
| } |
| |
| /// Flip the look-around assertion to its equivalent for reverse searches. |
| fn reversed(&self) -> Look { |
| match *self { |
| Look::StartLine => Look::EndLine, |
| Look::EndLine => Look::StartLine, |
| Look::StartText => Look::EndText, |
| Look::EndText => Look::StartText, |
| Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, |
| Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, |
| Look::WordBoundaryAscii => Look::WordBoundaryAscii, |
| Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, |
| } |
| } |
| |
| /// Split up the given byte classes into equivalence classes in a way that |
| /// is consistent with this look-around assertion. |
| fn add_to_byteset(&self, set: &mut ByteClassSet) { |
| match *self { |
| Look::StartText | Look::EndText => {} |
| Look::StartLine | Look::EndLine => { |
| set.set_range(b'\n', b'\n'); |
| } |
| Look::WordBoundaryUnicode |
| | Look::WordBoundaryUnicodeNegate |
| | Look::WordBoundaryAscii |
| | Look::WordBoundaryAsciiNegate => { |
| // We need to mark all ranges of bytes whose pairs result in |
| // evaluating \b differently. This isn't technically correct |
| // for Unicode word boundaries, but DFAs can't handle those |
| // anyway, and thus, the byte classes don't need to either |
| // since they are themselves only used in DFAs. |
| let iswb = regex_syntax::is_word_byte; |
| let mut b1: u16 = 0; |
| let mut b2: u16; |
| while b1 <= 255 { |
| b2 = b1 + 1; |
| while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { |
| b2 += 1; |
| } |
| set.set_range(b1 as u8, (b2 - 1) as u8); |
| b1 = b2; |
| } |
| } |
| } |
| } |
| } |
| |
| /// LookSet is a memory-efficient set of look-around assertions. Callers may |
| /// idempotently insert or remove any look-around assertion from a set. |
| #[repr(transparent)] |
| #[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] |
| pub(crate) struct LookSet { |
| set: u8, |
| } |
| |
| impl LookSet { |
| /// Return a LookSet from its representation. |
| pub(crate) fn from_repr(repr: u8) -> LookSet { |
| LookSet { set: repr } |
| } |
| |
| /// Return a mutable LookSet from a mutable pointer to its representation. |
| pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { |
| // SAFETY: This is safe since a LookSet is repr(transparent) where its |
| // repr is a u8. |
| unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } |
| } |
| |
| /// Return true if and only if this set is empty. |
| pub(crate) fn is_empty(&self) -> bool { |
| self.set == 0 |
| } |
| |
| /// Clears this set such that it has no assertions in it. |
| pub(crate) fn clear(&mut self) { |
| self.set = 0; |
| } |
| |
| /// Insert the given look-around assertion into this set. If the assertion |
| /// already exists, then this is a no-op. |
| pub(crate) fn insert(&mut self, look: Look) { |
| self.set |= look as u8; |
| } |
| |
| /// Remove the given look-around assertion from this set. If the assertion |
| /// is not in this set, then this is a no-op. |
| #[cfg(test)] |
| pub(crate) fn remove(&mut self, look: Look) { |
| self.set &= !(look as u8); |
| } |
| |
| /// Return true if and only if the given assertion is in this set. |
| pub(crate) fn contains(&self, look: Look) -> bool { |
| (look as u8) & self.set != 0 |
| } |
| |
| /// Subtract the given `other` set from the `self` set and return a new |
| /// set. |
| pub(crate) fn subtract(&self, other: LookSet) -> LookSet { |
| LookSet { set: self.set & !other.set } |
| } |
| |
| /// Return the intersection of the given `other` set with the `self` set |
| /// and return the resulting set. |
| pub(crate) fn intersect(&self, other: LookSet) -> LookSet { |
| LookSet { set: self.set & other.set } |
| } |
| } |
| |
| impl core::fmt::Debug for LookSet { |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| let mut members = vec![]; |
| for i in 0..8 { |
| let look = match Look::from_int(1 << i) { |
| None => continue, |
| Some(look) => look, |
| }; |
| if self.contains(look) { |
| members.push(look); |
| } |
| } |
| f.debug_tuple("LookSet").field(&members).finish() |
| } |
| } |
| |
| /// An iterator over all pattern IDs in an NFA. |
| pub struct PatternIter<'a> { |
| it: PatternIDIter, |
| /// We explicitly associate a lifetime with this iterator even though we |
| /// don't actually borrow anything from the NFA. We do this for backward |
| /// compatibility purposes. If we ever do need to borrow something from |
| /// the NFA, then we can and just get rid of this marker without breaking |
| /// the public API. |
| _marker: core::marker::PhantomData<&'a ()>, |
| } |
| |
| impl<'a> Iterator for PatternIter<'a> { |
| type Item = PatternID; |
| |
| fn next(&mut self) -> Option<PatternID> { |
| self.it.next() |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| // TODO: Replace tests using DFA with NFA matching engine once implemented. |
| use crate::dfa::{dense, Automaton}; |
| |
| #[test] |
| fn always_match() { |
| let nfa = NFA::always_match(); |
| let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); |
| let find = |input, start, end| { |
| dfa.find_leftmost_fwd_at(None, None, input, start, end) |
| .unwrap() |
| .map(|m| m.offset()) |
| }; |
| |
| assert_eq!(Some(0), find(b"", 0, 0)); |
| assert_eq!(Some(0), find(b"a", 0, 1)); |
| assert_eq!(Some(1), find(b"a", 1, 1)); |
| assert_eq!(Some(0), find(b"ab", 0, 2)); |
| assert_eq!(Some(1), find(b"ab", 1, 2)); |
| assert_eq!(Some(2), find(b"ab", 2, 2)); |
| } |
| |
| #[test] |
| fn never_match() { |
| let nfa = NFA::never_match(); |
| let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); |
| let find = |input, start, end| { |
| dfa.find_leftmost_fwd_at(None, None, input, start, end) |
| .unwrap() |
| .map(|m| m.offset()) |
| }; |
| |
| assert_eq!(None, find(b"", 0, 0)); |
| assert_eq!(None, find(b"a", 0, 1)); |
| assert_eq!(None, find(b"a", 1, 1)); |
| assert_eq!(None, find(b"ab", 0, 2)); |
| assert_eq!(None, find(b"ab", 1, 2)); |
| assert_eq!(None, find(b"ab", 2, 2)); |
| } |
| |
| #[test] |
| fn look_set() { |
| let mut f = LookSet::default(); |
| assert!(!f.contains(Look::StartText)); |
| assert!(!f.contains(Look::EndText)); |
| assert!(!f.contains(Look::StartLine)); |
| assert!(!f.contains(Look::EndLine)); |
| assert!(!f.contains(Look::WordBoundaryUnicode)); |
| assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); |
| assert!(!f.contains(Look::WordBoundaryAscii)); |
| assert!(!f.contains(Look::WordBoundaryAsciiNegate)); |
| |
| f.insert(Look::StartText); |
| assert!(f.contains(Look::StartText)); |
| f.remove(Look::StartText); |
| assert!(!f.contains(Look::StartText)); |
| |
| f.insert(Look::EndText); |
| assert!(f.contains(Look::EndText)); |
| f.remove(Look::EndText); |
| assert!(!f.contains(Look::EndText)); |
| |
| f.insert(Look::StartLine); |
| assert!(f.contains(Look::StartLine)); |
| f.remove(Look::StartLine); |
| assert!(!f.contains(Look::StartLine)); |
| |
| f.insert(Look::EndLine); |
| assert!(f.contains(Look::EndLine)); |
| f.remove(Look::EndLine); |
| assert!(!f.contains(Look::EndLine)); |
| |
| f.insert(Look::WordBoundaryUnicode); |
| assert!(f.contains(Look::WordBoundaryUnicode)); |
| f.remove(Look::WordBoundaryUnicode); |
| assert!(!f.contains(Look::WordBoundaryUnicode)); |
| |
| f.insert(Look::WordBoundaryUnicodeNegate); |
| assert!(f.contains(Look::WordBoundaryUnicodeNegate)); |
| f.remove(Look::WordBoundaryUnicodeNegate); |
| assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); |
| |
| f.insert(Look::WordBoundaryAscii); |
| assert!(f.contains(Look::WordBoundaryAscii)); |
| f.remove(Look::WordBoundaryAscii); |
| assert!(!f.contains(Look::WordBoundaryAscii)); |
| |
| f.insert(Look::WordBoundaryAsciiNegate); |
| assert!(f.contains(Look::WordBoundaryAsciiNegate)); |
| f.remove(Look::WordBoundaryAsciiNegate); |
| assert!(!f.contains(Look::WordBoundaryAsciiNegate)); |
| } |
| |
| #[test] |
| fn look_matches_start_line() { |
| let look = Look::StartLine; |
| |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("\n"), 0)); |
| assert!(look.matches(B("\n"), 1)); |
| assert!(look.matches(B("a"), 0)); |
| assert!(look.matches(B("\na"), 1)); |
| |
| assert!(!look.matches(B("a"), 1)); |
| assert!(!look.matches(B("a\na"), 1)); |
| } |
| |
| #[test] |
| fn look_matches_end_line() { |
| let look = Look::EndLine; |
| |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("\n"), 1)); |
| assert!(look.matches(B("\na"), 0)); |
| assert!(look.matches(B("\na"), 2)); |
| assert!(look.matches(B("a\na"), 1)); |
| |
| assert!(!look.matches(B("a"), 0)); |
| assert!(!look.matches(B("\na"), 1)); |
| assert!(!look.matches(B("a\na"), 0)); |
| assert!(!look.matches(B("a\na"), 2)); |
| } |
| |
| #[test] |
| fn look_matches_start_text() { |
| let look = Look::StartText; |
| |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("\n"), 0)); |
| assert!(look.matches(B("a"), 0)); |
| |
| assert!(!look.matches(B("\n"), 1)); |
| assert!(!look.matches(B("\na"), 1)); |
| assert!(!look.matches(B("a"), 1)); |
| assert!(!look.matches(B("a\na"), 1)); |
| } |
| |
| #[test] |
| fn look_matches_end_text() { |
| let look = Look::EndText; |
| |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("\n"), 1)); |
| assert!(look.matches(B("\na"), 2)); |
| |
| assert!(!look.matches(B("\na"), 0)); |
| assert!(!look.matches(B("a\na"), 1)); |
| assert!(!look.matches(B("a"), 0)); |
| assert!(!look.matches(B("\na"), 1)); |
| assert!(!look.matches(B("a\na"), 0)); |
| assert!(!look.matches(B("a\na"), 2)); |
| } |
| |
| #[test] |
| fn look_matches_word_unicode() { |
| let look = Look::WordBoundaryUnicode; |
| |
| // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
| // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
| |
| // Simple ASCII word boundaries. |
| assert!(look.matches(B("a"), 0)); |
| assert!(look.matches(B("a"), 1)); |
| assert!(look.matches(B("a "), 1)); |
| assert!(look.matches(B(" a "), 1)); |
| assert!(look.matches(B(" a "), 2)); |
| |
| // Unicode word boundaries with a non-ASCII codepoint. |
| assert!(look.matches(B("𝛃"), 0)); |
| assert!(look.matches(B("𝛃"), 4)); |
| assert!(look.matches(B("𝛃 "), 4)); |
| assert!(look.matches(B(" 𝛃 "), 1)); |
| assert!(look.matches(B(" 𝛃 "), 5)); |
| |
| // Unicode word boundaries between non-ASCII codepoints. |
| assert!(look.matches(B("𝛃𐆀"), 0)); |
| assert!(look.matches(B("𝛃𐆀"), 4)); |
| |
| // Non word boundaries for ASCII. |
| assert!(!look.matches(B(""), 0)); |
| assert!(!look.matches(B("ab"), 1)); |
| assert!(!look.matches(B("a "), 2)); |
| assert!(!look.matches(B(" a "), 0)); |
| assert!(!look.matches(B(" a "), 3)); |
| |
| // Non word boundaries with a non-ASCII codepoint. |
| assert!(!look.matches(B("𝛃b"), 4)); |
| assert!(!look.matches(B("𝛃 "), 5)); |
| assert!(!look.matches(B(" 𝛃 "), 0)); |
| assert!(!look.matches(B(" 𝛃 "), 6)); |
| assert!(!look.matches(B("𝛃"), 1)); |
| assert!(!look.matches(B("𝛃"), 2)); |
| assert!(!look.matches(B("𝛃"), 3)); |
| |
| // Non word boundaries with non-ASCII codepoints. |
| assert!(!look.matches(B("𝛃𐆀"), 1)); |
| assert!(!look.matches(B("𝛃𐆀"), 2)); |
| assert!(!look.matches(B("𝛃𐆀"), 3)); |
| assert!(!look.matches(B("𝛃𐆀"), 5)); |
| assert!(!look.matches(B("𝛃𐆀"), 6)); |
| assert!(!look.matches(B("𝛃𐆀"), 7)); |
| assert!(!look.matches(B("𝛃𐆀"), 8)); |
| } |
| |
| #[test] |
| fn look_matches_word_ascii() { |
| let look = Look::WordBoundaryAscii; |
| |
| // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
| // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
| |
| // Simple ASCII word boundaries. |
| assert!(look.matches(B("a"), 0)); |
| assert!(look.matches(B("a"), 1)); |
| assert!(look.matches(B("a "), 1)); |
| assert!(look.matches(B(" a "), 1)); |
| assert!(look.matches(B(" a "), 2)); |
| |
| // Unicode word boundaries with a non-ASCII codepoint. Since this is |
| // an ASCII word boundary, none of these match. |
| assert!(!look.matches(B("𝛃"), 0)); |
| assert!(!look.matches(B("𝛃"), 4)); |
| assert!(!look.matches(B("𝛃 "), 4)); |
| assert!(!look.matches(B(" 𝛃 "), 1)); |
| assert!(!look.matches(B(" 𝛃 "), 5)); |
| |
| // Unicode word boundaries between non-ASCII codepoints. Again, since |
| // this is an ASCII word boundary, none of these match. |
| assert!(!look.matches(B("𝛃𐆀"), 0)); |
| assert!(!look.matches(B("𝛃𐆀"), 4)); |
| |
| // Non word boundaries for ASCII. |
| assert!(!look.matches(B(""), 0)); |
| assert!(!look.matches(B("ab"), 1)); |
| assert!(!look.matches(B("a "), 2)); |
| assert!(!look.matches(B(" a "), 0)); |
| assert!(!look.matches(B(" a "), 3)); |
| |
| // Non word boundaries with a non-ASCII codepoint. |
| assert!(look.matches(B("𝛃b"), 4)); |
| assert!(!look.matches(B("𝛃 "), 5)); |
| assert!(!look.matches(B(" 𝛃 "), 0)); |
| assert!(!look.matches(B(" 𝛃 "), 6)); |
| assert!(!look.matches(B("𝛃"), 1)); |
| assert!(!look.matches(B("𝛃"), 2)); |
| assert!(!look.matches(B("𝛃"), 3)); |
| |
| // Non word boundaries with non-ASCII codepoints. |
| assert!(!look.matches(B("𝛃𐆀"), 1)); |
| assert!(!look.matches(B("𝛃𐆀"), 2)); |
| assert!(!look.matches(B("𝛃𐆀"), 3)); |
| assert!(!look.matches(B("𝛃𐆀"), 5)); |
| assert!(!look.matches(B("𝛃𐆀"), 6)); |
| assert!(!look.matches(B("𝛃𐆀"), 7)); |
| assert!(!look.matches(B("𝛃𐆀"), 8)); |
| } |
| |
| #[test] |
| fn look_matches_word_unicode_negate() { |
| let look = Look::WordBoundaryUnicodeNegate; |
| |
| // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
| // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
| |
| // Simple ASCII word boundaries. |
| assert!(!look.matches(B("a"), 0)); |
| assert!(!look.matches(B("a"), 1)); |
| assert!(!look.matches(B("a "), 1)); |
| assert!(!look.matches(B(" a "), 1)); |
| assert!(!look.matches(B(" a "), 2)); |
| |
| // Unicode word boundaries with a non-ASCII codepoint. |
| assert!(!look.matches(B("𝛃"), 0)); |
| assert!(!look.matches(B("𝛃"), 4)); |
| assert!(!look.matches(B("𝛃 "), 4)); |
| assert!(!look.matches(B(" 𝛃 "), 1)); |
| assert!(!look.matches(B(" 𝛃 "), 5)); |
| |
| // Unicode word boundaries between non-ASCII codepoints. |
| assert!(!look.matches(B("𝛃𐆀"), 0)); |
| assert!(!look.matches(B("𝛃𐆀"), 4)); |
| |
| // Non word boundaries for ASCII. |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("ab"), 1)); |
| assert!(look.matches(B("a "), 2)); |
| assert!(look.matches(B(" a "), 0)); |
| assert!(look.matches(B(" a "), 3)); |
| |
| // Non word boundaries with a non-ASCII codepoint. |
| assert!(look.matches(B("𝛃b"), 4)); |
| assert!(look.matches(B("𝛃 "), 5)); |
| assert!(look.matches(B(" 𝛃 "), 0)); |
| assert!(look.matches(B(" 𝛃 "), 6)); |
| // These don't match because they could otherwise return an offset that |
| // splits the UTF-8 encoding of a codepoint. |
| assert!(!look.matches(B("𝛃"), 1)); |
| assert!(!look.matches(B("𝛃"), 2)); |
| assert!(!look.matches(B("𝛃"), 3)); |
| |
| // Non word boundaries with non-ASCII codepoints. These also don't |
| // match because they could otherwise return an offset that splits the |
| // UTF-8 encoding of a codepoint. |
| assert!(!look.matches(B("𝛃𐆀"), 1)); |
| assert!(!look.matches(B("𝛃𐆀"), 2)); |
| assert!(!look.matches(B("𝛃𐆀"), 3)); |
| assert!(!look.matches(B("𝛃𐆀"), 5)); |
| assert!(!look.matches(B("𝛃𐆀"), 6)); |
| assert!(!look.matches(B("𝛃𐆀"), 7)); |
| // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end |
| // of the haystack. So the "end" of the haystack isn't a word and 𐆀 |
| // isn't a word, thus, \B matches. |
| assert!(look.matches(B("𝛃𐆀"), 8)); |
| } |
| |
| #[test] |
| fn look_matches_word_ascii_negate() { |
| let look = Look::WordBoundaryAsciiNegate; |
| |
| // \xF0\x9D\x9B\x83 = 𝛃 (in \w) |
| // \xF0\x90\x86\x80 = 𐆀 (not in \w) |
| |
| // Simple ASCII word boundaries. |
| assert!(!look.matches(B("a"), 0)); |
| assert!(!look.matches(B("a"), 1)); |
| assert!(!look.matches(B("a "), 1)); |
| assert!(!look.matches(B(" a "), 1)); |
| assert!(!look.matches(B(" a "), 2)); |
| |
| // Unicode word boundaries with a non-ASCII codepoint. Since this is |
| // an ASCII word boundary, none of these match. |
| assert!(look.matches(B("𝛃"), 0)); |
| assert!(look.matches(B("𝛃"), 4)); |
| assert!(look.matches(B("𝛃 "), 4)); |
| assert!(look.matches(B(" 𝛃 "), 1)); |
| assert!(look.matches(B(" 𝛃 "), 5)); |
| |
| // Unicode word boundaries between non-ASCII codepoints. Again, since |
| // this is an ASCII word boundary, none of these match. |
| assert!(look.matches(B("𝛃𐆀"), 0)); |
| assert!(look.matches(B("𝛃𐆀"), 4)); |
| |
| // Non word boundaries for ASCII. |
| assert!(look.matches(B(""), 0)); |
| assert!(look.matches(B("ab"), 1)); |
| assert!(look.matches(B("a "), 2)); |
| assert!(look.matches(B(" a "), 0)); |
| assert!(look.matches(B(" a "), 3)); |
| |
| // Non word boundaries with a non-ASCII codepoint. |
| assert!(!look.matches(B("𝛃b"), 4)); |
| assert!(look.matches(B("𝛃 "), 5)); |
| assert!(look.matches(B(" 𝛃 "), 0)); |
| assert!(look.matches(B(" 𝛃 "), 6)); |
| assert!(look.matches(B("𝛃"), 1)); |
| assert!(look.matches(B("𝛃"), 2)); |
| assert!(look.matches(B("𝛃"), 3)); |
| |
| // Non word boundaries with non-ASCII codepoints. |
| assert!(look.matches(B("𝛃𐆀"), 1)); |
| assert!(look.matches(B("𝛃𐆀"), 2)); |
| assert!(look.matches(B("𝛃𐆀"), 3)); |
| assert!(look.matches(B("𝛃𐆀"), 5)); |
| assert!(look.matches(B("𝛃𐆀"), 6)); |
| assert!(look.matches(B("𝛃𐆀"), 7)); |
| assert!(look.matches(B("𝛃𐆀"), 8)); |
| } |
| |
| fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] { |
| string.as_ref() |
| } |
| } |