| /*! |
| This module defines a sparse set data structure. Its most interesting |
| properties are: |
| |
| * They preserve insertion order. |
| * Set membership testing is done in constant time. |
| * Set insertion is done in constant time. |
| * Clearing the set is done in constant time. |
| |
| The cost for doing this is that the capacity of the set needs to be known up |
| front, and the elements in the set are limited to state identifiers. |
| |
| These sets are principally used when traversing an NFA state graph. This |
| happens at search time, for example, in the PikeVM. It also happens during DFA |
| determinization. |
| */ |
| |
| use alloc::{vec, vec::Vec}; |
| |
| use crate::util::primitives::StateID; |
| |
| /// A pairse of sparse sets. |
| /// |
| /// This is useful when one needs to compute NFA epsilon closures from a |
| /// previous set of states derived from an epsilon closure. One set can be the |
| /// starting states where as the other set can be the destination states after |
| /// following the transitions for a particular byte of input. |
| /// |
| /// There is no significance to 'set1' or 'set2'. They are both sparse sets of |
| /// the same size. |
| /// |
| /// The members of this struct are exposed so that callers may borrow 'set1' |
| /// and 'set2' individually without being force to borrow both at the same |
| /// time. |
| #[derive(Clone, Debug)] |
| pub(crate) struct SparseSets { |
| pub(crate) set1: SparseSet, |
| pub(crate) set2: SparseSet, |
| } |
| |
| impl SparseSets { |
| /// Create a new pair of sparse sets where each set has the given capacity. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| pub(crate) fn new(capacity: usize) -> SparseSets { |
| SparseSets { |
| set1: SparseSet::new(capacity), |
| set2: SparseSet::new(capacity), |
| } |
| } |
| |
| /// Resizes these sparse sets to have the new capacity given. |
| /// |
| /// The sets are automatically cleared. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| #[inline] |
| pub(crate) fn resize(&mut self, new_capacity: usize) { |
| self.set1.resize(new_capacity); |
| self.set2.resize(new_capacity); |
| } |
| |
| /// Clear both sparse sets. |
| pub(crate) fn clear(&mut self) { |
| self.set1.clear(); |
| self.set2.clear(); |
| } |
| |
| /// Swap set1 with set2. |
| pub(crate) fn swap(&mut self) { |
| core::mem::swap(&mut self.set1, &mut self.set2); |
| } |
| |
| /// Returns the memory usage, in bytes, used by this pair of sparse sets. |
| pub(crate) fn memory_usage(&self) -> usize { |
| self.set1.memory_usage() + self.set2.memory_usage() |
| } |
| } |
| |
| /// A sparse set used for representing ordered NFA states. |
| /// |
| /// This supports constant time addition and membership testing. Clearing an |
| /// entire set can also be done in constant time. Iteration yields elements |
| /// in the order in which they were inserted. |
| /// |
| /// The data structure is based on: https://research.swtch.com/sparse |
| /// Note though that we don't actually use uninitialized memory. We generally |
| /// reuse sparse sets, so the initial allocation cost is bareable. However, its |
| /// other properties listed above are extremely useful. |
| #[derive(Clone)] |
| pub(crate) struct SparseSet { |
| /// The number of elements currently in this set. |
| len: usize, |
| /// Dense contains the ids in the order in which they were inserted. |
| dense: Vec<StateID>, |
| /// Sparse maps ids to their location in dense. |
| /// |
| /// A state ID is in the set if and only if |
| /// sparse[id] < len && id == dense[sparse[id]]. |
| /// |
| /// Note that these are indices into 'dense'. It's a little weird to use |
| /// StateID here, but we know our length can never exceed the bounds of |
| /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |
| /// where as a usize is likely double that in most cases. |
| sparse: Vec<StateID>, |
| } |
| |
| impl SparseSet { |
| /// Create a new sparse set with the given capacity. |
| /// |
| /// Sparse sets have a fixed size and they cannot grow. Attempting to |
| /// insert more distinct elements than the total capacity of the set will |
| /// result in a panic. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| #[inline] |
| pub(crate) fn new(capacity: usize) -> SparseSet { |
| let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
| set.resize(capacity); |
| set |
| } |
| |
| /// Resizes this sparse set to have the new capacity given. |
| /// |
| /// This set is automatically cleared. |
| /// |
| /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| #[inline] |
| pub(crate) fn resize(&mut self, new_capacity: usize) { |
| assert!( |
| new_capacity <= StateID::LIMIT, |
| "sparse set capacity cannot excced {:?}", |
| StateID::LIMIT |
| ); |
| self.clear(); |
| self.dense.resize(new_capacity, StateID::ZERO); |
| self.sparse.resize(new_capacity, StateID::ZERO); |
| } |
| |
| /// Returns the capacity of this set. |
| /// |
| /// The capacity represents a fixed limit on the number of distinct |
| /// elements that are allowed in this set. The capacity cannot be changed. |
| #[inline] |
| pub(crate) fn capacity(&self) -> usize { |
| self.dense.len() |
| } |
| |
| /// Returns the number of elements in this set. |
| #[inline] |
| pub(crate) fn len(&self) -> usize { |
| self.len |
| } |
| |
| /// Returns true if and only if this set is empty. |
| #[inline] |
| pub(crate) fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| |
| /// Insert the state ID value into this set and return true if the given |
| /// state ID was not previously in this set. |
| /// |
| /// This operation is idempotent. If the given value is already in this |
| /// set, then this is a no-op. |
| /// |
| /// If more than `capacity` ids are inserted, then this panics. |
| /// |
| /// This is marked as inline(always) since the compiler won't inline it |
| /// otherwise, and it's a fairly hot piece of code in DFA determinization. |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| pub(crate) fn insert(&mut self, id: StateID) -> bool { |
| if self.contains(id) { |
| return false; |
| } |
| |
| let i = self.len(); |
| assert!( |
| i < self.capacity(), |
| "{:?} exceeds capacity of {:?} when inserting {:?}", |
| i, |
| self.capacity(), |
| id, |
| ); |
| // OK since i < self.capacity() and self.capacity() is guaranteed to |
| // be <= StateID::LIMIT. |
| let index = StateID::new_unchecked(i); |
| self.dense[index] = id; |
| self.sparse[id] = index; |
| self.len += 1; |
| true |
| } |
| |
| /// Returns true if and only if this set contains the given value. |
| #[inline] |
| pub(crate) fn contains(&self, id: StateID) -> bool { |
| let index = self.sparse[id]; |
| index.as_usize() < self.len() && self.dense[index] == id |
| } |
| |
| /// Clear this set such that it has no members. |
| #[inline] |
| pub(crate) fn clear(&mut self) { |
| self.len = 0; |
| } |
| |
| #[inline] |
| pub(crate) fn iter(&self) -> SparseSetIter<'_> { |
| SparseSetIter(self.dense[..self.len()].iter()) |
| } |
| |
| /// Returns the heap memory usage, in bytes, used by this sparse set. |
| #[inline] |
| pub(crate) fn memory_usage(&self) -> usize { |
| self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE |
| } |
| } |
| |
| impl core::fmt::Debug for SparseSet { |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| let elements: Vec<StateID> = self.iter().collect(); |
| f.debug_tuple("SparseSet").field(&elements).finish() |
| } |
| } |
| |
| /// An iterator over all elements in a sparse set. |
| /// |
| /// The lifetime `'a` refers to the lifetime of the set being iterated over. |
| #[derive(Debug)] |
| pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |
| |
| impl<'a> Iterator for SparseSetIter<'a> { |
| type Item = StateID; |
| |
| #[cfg_attr(feature = "perf-inline", inline(always))] |
| fn next(&mut self) -> Option<StateID> { |
| self.0.next().map(|&id| id) |
| } |
| } |