| use core::fmt; |
| |
| /// A representation of byte oriented equivalence classes. |
| /// |
| /// This is used in a DFA to reduce the size of the transition table. This can |
| /// have a particularly large impact not only on the total size of a dense DFA, |
| /// but also on compile times. |
| #[derive(Clone, Copy)] |
| pub struct ByteClasses([u8; 256]); |
| |
| impl ByteClasses { |
| /// Creates a new set of equivalence classes where all bytes are mapped to |
| /// the same class. |
| pub fn empty() -> ByteClasses { |
| ByteClasses([0; 256]) |
| } |
| |
| /// Creates a new set of equivalence classes where each byte belongs to |
| /// its own equivalence class. |
| pub fn singletons() -> ByteClasses { |
| let mut classes = ByteClasses::empty(); |
| for i in 0..256 { |
| classes.set(i as u8, i as u8); |
| } |
| classes |
| } |
| |
| /// Copies the byte classes given. The given slice must have length 0 or |
| /// length 256. Slices of length 0 are treated as singletons (every byte |
| /// is its own class). |
| pub fn from_slice(slice: &[u8]) -> ByteClasses { |
| assert!(slice.is_empty() || slice.len() == 256); |
| |
| if slice.is_empty() { |
| ByteClasses::singletons() |
| } else { |
| let mut classes = ByteClasses::empty(); |
| for (b, &class) in slice.iter().enumerate() { |
| classes.set(b as u8, class); |
| } |
| classes |
| } |
| } |
| |
| /// Set the equivalence class for the given byte. |
| #[inline] |
| pub fn set(&mut self, byte: u8, class: u8) { |
| self.0[byte as usize] = class; |
| } |
| |
| /// Get the equivalence class for the given byte. |
| #[inline] |
| pub fn get(&self, byte: u8) -> u8 { |
| self.0[byte as usize] |
| } |
| |
| /// Get the equivalence class for the given byte while forcefully |
| /// eliding bounds checks. |
| #[inline] |
| pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { |
| *self.0.get_unchecked(byte as usize) |
| } |
| |
| /// Return the total number of elements in the alphabet represented by |
| /// these equivalence classes. Equivalently, this returns the total number |
| /// of equivalence classes. |
| #[inline] |
| pub fn alphabet_len(&self) -> usize { |
| self.0[255] as usize + 1 |
| } |
| |
| /// Returns true if and only if every byte in this class maps to its own |
| /// equivalence class. Equivalently, there are 256 equivalence classes |
| /// and each class contains exactly one byte. |
| #[inline] |
| pub fn is_singleton(&self) -> bool { |
| self.alphabet_len() == 256 |
| } |
| |
| /// Returns an iterator over a sequence of representative bytes from each |
| /// equivalence class. Namely, this yields exactly N items, where N is |
| /// equivalent to the number of equivalence classes. Each item is an |
| /// arbitrary byte drawn from each equivalence class. |
| /// |
| /// This is useful when one is determinizing an NFA and the NFA's alphabet |
| /// hasn't been converted to equivalence classes yet. Picking an arbitrary |
| /// byte from each equivalence class then permits a full exploration of |
| /// the NFA instead of using every possible byte value. |
| #[cfg(feature = "std")] |
| pub fn representatives(&self) -> ByteClassRepresentatives { |
| ByteClassRepresentatives { classes: self, byte: 0, last_class: None } |
| } |
| |
| /// Returns all of the bytes in the given equivalence class. |
| /// |
| /// The second element in the tuple indicates the number of elements in |
| /// the array. |
| fn elements(&self, equiv: u8) -> ([u8; 256], usize) { |
| let (mut array, mut len) = ([0; 256], 0); |
| for b in 0..256 { |
| if self.get(b as u8) == equiv { |
| array[len] = b as u8; |
| len += 1; |
| } |
| } |
| (array, len) |
| } |
| } |
| |
| impl fmt::Debug for ByteClasses { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if self.is_singleton() { |
| write!(f, "ByteClasses({{singletons}})") |
| } else { |
| write!(f, "ByteClasses(")?; |
| for equiv in 0..self.alphabet_len() { |
| let (members, len) = self.elements(equiv as u8); |
| write!(f, "{} => {:?}", equiv, &members[..len])?; |
| } |
| write!(f, ")") |
| } |
| } |
| } |
| |
| /// An iterator over representative bytes from each equivalence class. |
| #[cfg(feature = "std")] |
| #[derive(Debug)] |
| pub struct ByteClassRepresentatives<'a> { |
| classes: &'a ByteClasses, |
| byte: usize, |
| last_class: Option<u8>, |
| } |
| |
| #[cfg(feature = "std")] |
| impl<'a> Iterator for ByteClassRepresentatives<'a> { |
| type Item = u8; |
| |
| fn next(&mut self) -> Option<u8> { |
| while self.byte < 256 { |
| let byte = self.byte as u8; |
| let class = self.classes.get(byte); |
| self.byte += 1; |
| |
| if self.last_class != Some(class) { |
| self.last_class = Some(class); |
| return Some(byte); |
| } |
| } |
| None |
| } |
| } |
| |
| /// A byte class set keeps track of an *approximation* of equivalence classes |
| /// of bytes during NFA construction. That is, every byte in an equivalence |
| /// class cannot discriminate between a match and a non-match. |
| /// |
| /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the |
| /// same equivalence class because it never matters whether an `a` or a `b` is |
| /// seen, and no combination of `a`s and `b`s in the text can discriminate |
| /// a match. |
| /// |
| /// Note though that this does not compute the minimal set of equivalence |
| /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the |
| /// same equivalence class for the same reason that `a` and `b` are in the |
| /// same equivalence class in the aforementioned regex. However, in this |
| /// implementation, `a` and `c` are put into distinct equivalence classes. |
| /// The reason for this is implementation complexity. In the future, we should |
| /// endeavor to compute the minimal equivalence classes since they can have a |
| /// rather large impact on the size of the DFA. |
| /// |
| /// The representation here is 256 booleans, all initially set to false. Each |
| /// boolean maps to its corresponding byte based on position. A `true` value |
| /// indicates the end of an equivalence class, where its corresponding byte |
| /// and all of the bytes corresponding to all previous contiguous `false` |
| /// values are in the same equivalence class. |
| /// |
| /// This particular representation only permits contiguous ranges of bytes to |
| /// be in the same equivalence class, which means that we can never discover |
| /// the true minimal set of equivalence classes. |
| #[cfg(feature = "std")] |
| #[derive(Debug)] |
| pub struct ByteClassSet(Vec<bool>); |
| |
| #[cfg(feature = "std")] |
| impl ByteClassSet { |
| /// Create a new set of byte classes where all bytes are part of the same |
| /// equivalence class. |
| pub fn new() -> Self { |
| ByteClassSet(vec![false; 256]) |
| } |
| |
| /// Indicate the the range of byte given (inclusive) can discriminate a |
| /// match between it and all other bytes outside of the range. |
| pub fn set_range(&mut self, start: u8, end: u8) { |
| debug_assert!(start <= end); |
| if start > 0 { |
| self.0[start as usize - 1] = true; |
| } |
| self.0[end as usize] = true; |
| } |
| |
| /// Convert this boolean set to a map that maps all byte values to their |
| /// corresponding equivalence class. The last mapping indicates the largest |
| /// equivalence class identifier (which is never bigger than 255). |
| pub fn byte_classes(&self) -> ByteClasses { |
| let mut classes = ByteClasses::empty(); |
| let mut class = 0u8; |
| let mut i = 0; |
| loop { |
| classes.set(i as u8, class as u8); |
| if i >= 255 { |
| break; |
| } |
| if self.0[i] { |
| class = class.checked_add(1).unwrap(); |
| } |
| i += 1; |
| } |
| classes |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| #[cfg(feature = "std")] |
| #[test] |
| fn byte_classes() { |
| use super::ByteClassSet; |
| |
| let mut set = ByteClassSet::new(); |
| set.set_range(b'a', b'z'); |
| |
| let classes = set.byte_classes(); |
| assert_eq!(classes.get(0), 0); |
| assert_eq!(classes.get(1), 0); |
| assert_eq!(classes.get(2), 0); |
| assert_eq!(classes.get(b'a' - 1), 0); |
| assert_eq!(classes.get(b'a'), 1); |
| assert_eq!(classes.get(b'm'), 1); |
| assert_eq!(classes.get(b'z'), 1); |
| assert_eq!(classes.get(b'z' + 1), 2); |
| assert_eq!(classes.get(254), 2); |
| assert_eq!(classes.get(255), 2); |
| |
| let mut set = ByteClassSet::new(); |
| set.set_range(0, 2); |
| set.set_range(4, 6); |
| let classes = set.byte_classes(); |
| assert_eq!(classes.get(0), 0); |
| assert_eq!(classes.get(1), 0); |
| assert_eq!(classes.get(2), 0); |
| assert_eq!(classes.get(3), 1); |
| assert_eq!(classes.get(4), 2); |
| assert_eq!(classes.get(5), 2); |
| assert_eq!(classes.get(6), 2); |
| assert_eq!(classes.get(7), 3); |
| assert_eq!(classes.get(255), 3); |
| } |
| |
| #[cfg(feature = "std")] |
| #[test] |
| fn full_byte_classes() { |
| use super::ByteClassSet; |
| |
| let mut set = ByteClassSet::new(); |
| for i in 0..256u16 { |
| set.set_range(i as u8, i as u8); |
| } |
| assert_eq!(set.byte_classes().alphabet_len(), 256); |
| } |
| } |