// vendor/regex-lite-0.1.6/src/nfa.rs - toolchain/rustc - Git at Google

 use core::{cell::RefCell, mem::size_of};

 use alloc::{string::String, sync::Arc, vec, vec::Vec};

 use crate::{
     error::Error,
     hir::{self, Hir, HirKind},
     int::U32,
 };

 /// The identifier of a state in an [`NFA`].
 ///
 /// A state's ID is its index in the NFA's list of states.
 pub(crate) type StateID = u32;

 /// Configuration for compiling an [`NFA`].
 #[derive(Clone, Copy, Debug)]
 pub(crate) struct Config {
     /// The maximum heap usage, in bytes, that a compiled NFA may reach.
     /// When `None`, no limit is enforced.
     pub(crate) size_limit: Option<usize>,
 }

 impl Default for Config {
     /// Returns a configuration with a size limit of 10 MiB.
     fn default() -> Config {
         let mebibyte: usize = 1 << 20;
         Config { size_limit: Some(10 * mebibyte) }
     }
 }

 /// A compiled NFA, together with the capture group metadata of the pattern
 /// it was built from.
 ///
 /// Values of this type are produced by `NFA::new`, which drives a `Compiler`
 /// over an HIR.
 #[derive(Clone)]
 pub(crate) struct NFA {
     /// The pattern string this NFA was generated from.
     ///
     /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯
     pattern: String,
     /// The states that make up this NFA. A state's ID is its index in this
     /// vector.
     states: Vec<State>,
     /// The ID of the start state.
     start: StateID,
     /// Whether this NFA can only match at the beginning of a haystack.
     is_start_anchored: bool,
     /// Whether this NFA can match the empty string.
     is_match_empty: bool,
     /// If every match has the same number of matching capture groups, then
     /// this corresponds to the number of groups. The implicit group found in
     /// every pattern is not counted.
     static_explicit_captures_len: Option<usize>,
     /// A map from capture group name to its corresponding index.
     cap_name_to_index: CaptureNameMap,
     /// A map from capture group index to the corresponding name, if one
     /// exists. Its length is the total number of capture groups.
     cap_index_to_name: Vec<Option<Arc<str>>>,
     /// Heap memory used indirectly by NFA states and other things (like the
     /// various capturing group representations above). Since each state
     /// might use a different amount of heap, we need to keep track of this
     /// incrementally.
     memory_extra: usize,
 }

 impl NFA {
     /// Compiles the given HIR into an NFA under the given configuration.
     ///
     /// The `pattern` string is stored unchanged and can be read back with
     /// `NFA::pattern`.
     ///
     /// # Errors
     ///
     /// Any error reported by the compiler (for example, exceeding the
     /// configured size limit) is returned as-is.
     pub(crate) fn new(
         config: Config,
         pattern: String,
         hir: &Hir,
     ) -> Result<NFA, Error> {
         let compiler = Compiler::new(config, pattern);
         compiler.compile(hir)
     }

     /// The pattern string this NFA was built from.
     pub(crate) fn pattern(&self) -> &str {
         self.pattern.as_str()
     }

     /// Looks up a state by its ID.
     ///
     /// # Panics
     ///
     /// Panics when `id` is not the ID of a state in this NFA.
     pub(crate) fn state(&self, id: StateID) -> &State {
         let index = id.as_usize();
         &self.states[index]
     }

     /// The number of states in this NFA.
     pub(crate) fn len(&self) -> usize {
         self.states.len()
     }

     /// The ID of the state at which matching begins.
     pub(crate) fn start(&self) -> StateID {
         self.start
     }

     /// Maps a capture group name to its group index, or returns `None` when
     /// no group has that name.
     pub(crate) fn to_index(&self, name: &str) -> Option<usize> {
         let index = *self.cap_name_to_index.get(name)?;
         Some(index.as_usize())
     }

     /*
     /// Returns the capture group name for the corresponding index.
     /// If no such group with the given index, then `None` is returned.
     pub(crate) fn to_name(&self, index: usize) -> Option<&str> {
         self.cap_index_to_name.get(index)?.as_deref()
     }
     */

     /// Iterates over every capture group in index order, yielding the
     /// group's name when it has one.
     pub(crate) fn capture_names(&self) -> CaptureNames<'_> {
         let it = self.cap_index_to_name.iter();
         CaptureNames { it }
     }

     /// The number of capture groups in this NFA, counting the first,
     /// implicit group.
     pub(crate) fn group_len(&self) -> usize {
         self.cap_index_to_name.len()
     }

     /// Whether this NFA can only match at the beginning of a haystack.
     pub(crate) fn is_start_anchored(&self) -> bool {
         self.is_start_anchored
     }

     /// When every match reports the same number of matching capture groups,
     /// returns that number (not counting the implicit group found in every
     /// pattern). Otherwise returns `None`.
     pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
         self.static_explicit_captures_len
     }

     /// The heap memory, in bytes, used by this NFA: the state table, the
     /// index-to-name table, and the incrementally tracked extra usage.
     fn memory_usage(&self) -> usize {
         let states = self.states.len() * size_of::<State>();
         let names =
             self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>();
         states + names + self.memory_extra
     }
 }

 impl core::fmt::Debug for NFA {
     /// Writes the pattern followed by one line per state, each prefixed with
     /// the state's zero-padded index.
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         writeln!(f, "NFA(")?;
         writeln!(f, "pattern: {}", self.pattern)?;
         self.states.iter().enumerate().try_for_each(|(sid, state)| {
             writeln!(f, "{:07?}: {:?}", sid, state)
         })?;
         writeln!(f, ")")
     }
 }

 /// An iterator over all capture groups in an NFA.
 ///
 /// If a particular group has a name, then it is yielded. Otherwise, `None`
 /// is yielded.
 ///
 /// The iterator is backed by a slice iterator, so the number of remaining
 /// items is always known exactly and is reported through `size_hint`.
 #[derive(Clone, Debug)]
 pub(crate) struct CaptureNames<'a> {
     it: core::slice::Iter<'a, Option<Arc<str>>>,
 }

 impl<'a> Iterator for CaptureNames<'a> {
     type Item = Option<&'a str>;

     /// Yields the name of the next capture group, or `None` for the item
     /// when that group is unnamed.
     fn next(&mut self) -> Option<Option<&'a str>> {
         self.it.next().map(|n| n.as_deref())
     }

     /// Forwards the exact bounds of the underlying slice iterator.
     ///
     /// Without this override the default `(0, None)` would be reported,
     /// which prevents `collect` from pre-allocating and breaks any
     /// `ExactSizeIterator` built on top of this hint.
     fn size_hint(&self) -> (usize, Option<usize>) {
         self.it.size_hint()
     }

     /// Returns the number of remaining groups without walking them one by
     /// one.
     fn count(self) -> usize {
         self.it.count()
     }
 }

 /// A single state in the NFA.
 ///
 /// Every `target` field holds the ID of the state to move to next. The
 /// compiler adds states with a placeholder target of `0` and fills in the
 /// real target afterwards via `Compiler::patch`.
 #[derive(Clone, Eq, PartialEq)]
 pub(crate) enum State {
     /// A transition to `target` on the single character `ch`.
     Char { target: StateID, ch: char },
     /// A transition to `target` on a character class, stored as a list of
     /// `(start, end)` pairs.
     Ranges { target: StateID, ranges: Vec<(char, char)> },
     /// A fan-out to each state in `targets`. When `reverse` is true,
     /// `State::iter_splits` visits the targets back to front; the compiler
     /// sets it for non-greedy repetitions.
     Splits { targets: Vec<StateID>, reverse: bool },
     /// With `look: None`, an unconditional epsilon transition to `target`.
     /// With `look: Some(..)`, a transition carrying a look-around assertion.
     Goto { target: StateID, look: Option<hir::Look> },
     /// A capture marker. The compiler uses slot `2 * index` for the start of
     /// group `index` and slot `2 * index + 1` for its end.
     Capture { target: StateID, slot: u32 },
     /// A state that can never be transitioned out of.
     Fail,
     /// The match state. The compiler appends one after the outermost
     /// (implicit) capture group.
     Match,
 }

 impl State {
     /// Returns the number of heap bytes owned by this state.
     ///
     /// Only `Splits` and `Ranges` own heap memory (their vectors); every
     /// other kind of state reports zero.
     fn memory_usage(&self) -> usize {
         match self {
             State::Splits { targets, .. } => {
                 targets.len() * size_of::<StateID>()
             }
             State::Ranges { ranges, .. } => {
                 ranges.len() * size_of::<(char, char)>()
             }
             State::Char { .. }
             | State::Goto { .. }
             | State::Capture { .. }
             | State::Fail
             | State::Match => 0,
         }
     }

     /// Iterates over the given split targets, front to back when `reverse`
     /// is false and back to front when it is true.
     pub(crate) fn iter_splits<'a>(
         splits: &'a [StateID],
         reverse: bool,
     ) -> impl Iterator<Item = StateID> + 'a {
         // Shrink the slice from whichever end we are consuming.
         let mut remaining = splits;
         core::iter::from_fn(move || {
             let (&sid, rest) = if reverse {
                 remaining.split_last()?
             } else {
                 remaining.split_first()?
             };
             remaining = rest;
             Some(sid)
         })
     }
 }

 impl core::fmt::Debug for State {
     /// Writes a compact, single-line description of this state.
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match self {
             State::Fail => write!(f, "FAIL"),
             State::Match => write!(f, "MATCH"),
             State::Char { target, ch } => {
                 write!(f, "{:?} => {:?}", ch, target)
             }
             State::Capture { target, slot } => {
                 write!(f, "capture(slot={:?}) => {:?}", slot, target)
             }
             State::Goto { target, look: None } => {
                 write!(f, "goto({:?})", target)
             }
             State::Goto { target, look: Some(look) } => {
                 write!(f, "{:?} => {:?}", look, target)
             }
             State::Ranges { target, ranges } => {
                 // Entries are comma separated; no trailing separator.
                 let mut first = true;
                 for (start, end) in ranges {
                     if !first {
                         write!(f, ", ")?;
                     }
                     first = false;
                     write!(f, "{:?}-{:?} => {:?}", start, end, target)?;
                 }
                 Ok(())
             }
             State::Splits { targets, reverse } => {
                 write!(f, "splits(")?;
                 // Targets are printed in the order they would be visited.
                 let mut first = true;
                 for sid in State::iter_splits(targets, *reverse) {
                     if !first {
                         write!(f, ", ")?;
                     }
                     first = false;
                     write!(f, "{:?}", sid)?;
                 }
                 write!(f, ")")
             }
         }
     }
 }

 /// A map from capture group name to its corresponding capture group index.
 ///
 /// We define a type alias here so that we can transparently use a `HashMap`
 /// whenever it's available. We do so presumably because it's faster, although
 /// there are no benchmarks verifying this.
 ///
 /// Without the `std` feature, a `BTreeMap` from `alloc` is used instead.
 #[cfg(feature = "std")]
 type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>;
 #[cfg(not(feature = "std"))]
 type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>;

 /// Builds an [`NFA`] from an HIR.
 ///
 /// The NFA under construction lives in a `RefCell` so that the compilation
 /// routines, which all take `&self`, can add and patch states.
 #[derive(Debug)]
 struct Compiler {
     /// The configuration (currently just the size limit) to compile under.
     config: Config,
     /// The NFA being built; it is handed to the caller by `compile`.
     nfa: RefCell<NFA>,
 }

 impl Compiler {
     /// Creates a compiler holding an empty NFA for the given pattern.
     fn new(config: Config, pattern: String) -> Compiler {
         let blank = NFA {
             pattern,
             states: Vec::new(),
             start: 0,
             is_start_anchored: false,
             is_match_empty: false,
             static_explicit_captures_len: None,
             cap_name_to_index: CaptureNameMap::default(),
             cap_index_to_name: Vec::new(),
             memory_extra: 0,
         };
         Compiler { config, nfa: RefCell::new(blank) }
     }

     /// Compiles the whole HIR into an NFA and returns it.
     ///
     /// The expression is wrapped in the implicit capture group 0 and
     /// followed by a `Match` state.
     fn compile(self, hir: &Hir) -> Result<NFA, Error> {
         {
             // Copy the HIR's properties over before any states are added.
             let mut nfa = self.nfa.borrow_mut();
             nfa.is_start_anchored = hir.is_start_anchored();
             nfa.is_match_empty = hir.is_match_empty();
             nfa.static_explicit_captures_len =
                 hir.static_explicit_captures_len();
         }
         let whole = self.c_capture(0, None, hir)?;
         let accept = self.add(State::Match)?;
         self.patch(whole.end, accept)?;
         self.nfa.borrow_mut().start = whole.start;
         Ok(self.nfa.into_inner())
     }

     /// Compiles an arbitrary HIR expression by dispatching on its kind.
     fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> {
         match hir.kind() {
             HirKind::Empty => self.c_empty(),
             HirKind::Char(ch) => self.c_char(*ch),
             HirKind::Class(class) => self.c_class(class),
             HirKind::Look(look) => self.c_look(look),
             HirKind::Repetition(rep) => self.c_repetition(rep),
             HirKind::Capture(cap) => {
                 let name = cap.name.as_deref();
                 self.c_capture(cap.index, name, &cap.sub)
             }
             HirKind::Concat(subs) => {
                 let parts = subs.iter().map(|sub| self.c(sub));
                 self.c_concat(parts)
             }
             HirKind::Alternation(subs) => {
                 let branches = subs.iter().map(|sub| self.c(sub));
                 self.c_alternation(branches)
             }
         }
     }

     /// Adds a single `Fail` state, which can never be transitioned out of,
     /// and returns it as both the start and the end of the sub-graph.
     fn c_fail(&self) -> Result<ThompsonRef, Error> {
         self.add(State::Fail).map(|id| ThompsonRef { start: id, end: id })
     }

     /// Adds one unconditional epsilon transition.
     ///
     /// The new state is returned as both `start` and `end`. A caller will
     /// normally keep `start` as-is and patch `end` so that it leads to
     /// whatever comes next.
     fn c_empty(&self) -> Result<ThompsonRef, Error> {
         self.add_empty().map(|id| ThompsonRef { start: id, end: id })
     }

     /// Adds a state for the literal character `ch`. Its target is a
     /// placeholder until patched.
     fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
         let state = State::Char { target: 0, ch };
         self.add(state).map(|id| ThompsonRef { start: id, end: id })
     }

     /// Adds a state for the given character class.
     ///
     /// A class with no ranges becomes a `Fail` state.
     fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> {
         let state = if class.ranges.is_empty() {
             // An empty `Ranges` state would most likely behave like `Fail`
             // anyway, since there is nothing for it to match. Emitting an
             // explicit `Fail` costs nothing and states the intent directly.
             State::Fail
         } else {
             let ranges =
                 class.ranges.iter().map(|r| (r.start, r.end)).collect();
             State::Ranges { target: 0, ranges }
         };
         let id = self.add(state)?;
         Ok(ThompsonRef { start: id, end: id })
     }

     /// Adds a `Goto` state carrying the given look-around assertion. Its
     /// target is a placeholder until patched.
     fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> {
         let state = State::Goto { target: 0, look: Some(*look) };
         self.add(state).map(|id| ThompsonRef { start: id, end: id })
     }

     /// Compiles a repetition of any shape and greediness by picking the
     /// specialised routine that fits its bounds.
     fn c_repetition(
         &self,
         rep: &hir::Repetition,
     ) -> Result<ThompsonRef, Error> {
         let (min, greedy) = (rep.min, rep.greedy);
         match rep.max {
             // No upper bound: `x{min,}`.
             None => self.c_at_least(&rep.sub, greedy, min),
             // `x?`
             Some(1) if min == 0 => self.c_zero_or_one(&rep.sub, greedy),
             // `x{n}`
             Some(max) if max == min => self.c_exactly(&rep.sub, min),
             // `x{min,max}`
             Some(max) => self.c_bounded(&rep.sub, greedy, min, max),
         }
     }

     /// Compile the given expression such that it matches at least `min` times,
     /// but no more than `max` times.
     ///
     /// When `greedy` is true, then the preference is for the expression to
     /// match as much as possible. Otherwise, it will match as little as
     /// possible.
     ///
     /// The result starts with `min` mandatory copies of the expression,
     /// followed by `max - min` optional copies. Every optional copy is guarded
     /// by its own split state that can also jump straight to the shared end
     /// state.
     fn c_bounded(
         &self,
         hir: &Hir,
         greedy: bool,
         min: u32,
         max: u32,
     ) -> Result<ThompsonRef, Error> {
         let prefix = self.c_exactly(hir, min)?;
         // Nothing optional to add. (Within this file, `c_repetition` already
         // routes `min == max` to `c_exactly`, so this is a safeguard.)
         if min == max {
             return Ok(prefix);
         }

         // It is tempting here to compile the rest here as a concatenation
         // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
         // were `aaa?a?a?`. The problem here is that it leads to this program
         // (`union` is what this crate's `Debug` output calls `splits`):
         //
         //     >000000: 61 => 01
         //      000001: 61 => 02
         //      000002: union(03, 04)
         //      000003: 61 => 04
         //      000004: union(05, 06)
         //      000005: 61 => 06
         //      000006: union(07, 08)
         //      000007: 61 => 08
         //      000008: MATCH
         //
         // And effectively, once you hit state 2, the epsilon closure will
         // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is
         // better to instead compile it like so:
         //
         //     >000000: 61 => 01
         //      000001: 61 => 02
         //      000002: union(03, 08)
         //      000003: 61 => 04
         //      000004: union(05, 08)
         //      000005: 61 => 06
         //      000006: union(07, 08)
         //      000007: 61 => 08
         //      000008: MATCH
         //
         // So that the epsilon closure of state 2 is now just 3 and 8.
         //
         // `empty` is the shared exit that every split can bail out to.
         let empty = self.add_empty()?;
         let mut prev_end = prefix.end;
         for _ in min..max {
             let splits =
                 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
             let compiled = self.c(hir)?;
             self.patch(prev_end, splits)?;
             // Target order matters: the copy comes first and the exit second.
             // `reverse` flips the visiting order for non-greedy repetitions.
             self.patch(splits, compiled.start)?;
             self.patch(splits, empty)?;
             prev_end = compiled.end;
         }
         // The last optional copy falls through to the shared exit.
         self.patch(prev_end, empty)?;
         Ok(ThompsonRef { start: prefix.start, end: empty })
     }

     /// Compile the given expression such that it may be matched `n` or more
     /// times, where `n` can be any integer. (Although a particularly large
     /// integer is likely to run afoul of any configured size limits.)
     ///
     /// When `greedy` is true, then the preference is for the expression to
     /// match as much as possible. Otherwise, it will match as little as
     /// possible.
     ///
     /// Three shapes are produced: `n == 0` compiles `x*`, `n == 1` compiles
     /// `x+`, and any larger `n` compiles `n - 1` mandatory copies followed by
     /// `x+`.
     fn c_at_least(
         &self,
         hir: &Hir,
         greedy: bool,
         n: u32,
     ) -> Result<ThompsonRef, Error> {
         if n == 0 {
             // When the expression cannot match the empty string, then we
             // can get away with something much simpler: just one 'alt'
             // instruction that optionally repeats itself. But if the expr
             // can match the empty string... see below.
             if !hir.is_match_empty() {
                 let splits = self.add(State::Splits {
                     targets: vec![],
                     reverse: !greedy,
                 })?;
                 let compiled = self.c(hir)?;
                 self.patch(splits, compiled.start)?;
                 self.patch(compiled.end, splits)?;
                 // The split is both entry and exit: the caller's patch of
                 // `end` appends the "leave the loop" target after the
                 // "repeat" target added above.
                 return Ok(ThompsonRef { start: splits, end: splits });
             }

             // What's going on here? Shouldn't x* be simpler than this? It
             // turns out that when implementing leftmost-first (Perl-like)
             // match semantics, x* results in an incorrect preference order
             // when computing the transitive closure of states if and only if
             // 'x' can match the empty string. So instead, we compile x* as
             // (x+)?, which preserves the correct preference order.
             //
             // See: https://github.com/rust-lang/regex/issues/779
             let compiled = self.c(hir)?;
             let plus =
                 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
             self.patch(compiled.end, plus)?;
             self.patch(plus, compiled.start)?;

             let question =
                 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
             let empty = self.add_empty()?;
             self.patch(question, compiled.start)?;
             self.patch(question, empty)?;
             self.patch(plus, empty)?;
             Ok(ThompsonRef { start: question, end: empty })
         } else if n == 1 {
             // x+: match x once, then a split that loops back to x. The
             // caller's patch of `end` appends the exit as the second target.
             let compiled = self.c(hir)?;
             let splits =
                 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
             self.patch(compiled.end, splits)?;
             self.patch(splits, compiled.start)?;
             Ok(ThompsonRef { start: compiled.start, end: splits })
         } else {
             // x{n,}: n - 1 mandatory copies, then one more copy that loops
             // exactly as in the x+ case above.
             let prefix = self.c_exactly(hir, n - 1)?;
             let last = self.c(hir)?;
             let splits =
                 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
             self.patch(prefix.end, last.start)?;
             self.patch(last.end, splits)?;
             self.patch(splits, last.start)?;
             Ok(ThompsonRef { start: prefix.start, end: splits })
         }
     }

     /// Compiles the given expression so that it matches either once or not
     /// at all.
     ///
     /// A greedy repetition prefers to match the expression; a non-greedy one
     /// prefers to skip it.
     fn c_zero_or_one(
         &self,
         hir: &Hir,
         greedy: bool,
     ) -> Result<ThompsonRef, Error> {
         let fork_state = State::Splits { targets: vec![], reverse: !greedy };
         let fork = self.add(fork_state)?;
         let sub = self.c(hir)?;
         let join = self.add_empty()?;
         // First choice: take the expression. Second choice: skip it.
         self.patch(fork, sub.start)?;
         self.patch(fork, join)?;
         self.patch(sub.end, join)?;
         Ok(ThompsonRef { start: fork, end: join })
     }

     /// Compiles `n` back-to-back copies of the given HIR expression.
     fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> {
         let copies = (0..n).map(|_| self.c(hir));
         self.c_concat(copies)
     }

     /// Compile the given expression and insert capturing states at the
     /// beginning and end of it. The slot for the capture states is computed
     /// from the index.
     fn c_capture(
         &self,
         index: u32,
         name: Option<&str>,
         hir: &Hir,
     ) -> Result<ThompsonRef, Error> {
         // For discontiguous indices, push placeholders for earlier capture
         // groups that weren't explicitly added. This can happen, for example,
         // with patterns like '(a){0}(a)' where '(a){0}' is completely removed
         // from the pattern.
         let existing_groups_len = self.nfa.borrow().cap_index_to_name.len();
         for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) {
             self.nfa.borrow_mut().cap_index_to_name.push(None);
         }
         if index.as_usize() >= existing_groups_len {
             if let Some(name) = name {
                 let name = Arc::from(name);
                 let mut nfa = self.nfa.borrow_mut();
                 nfa.cap_name_to_index.insert(Arc::clone(&name), index);
                 nfa.cap_index_to_name.push(Some(Arc::clone(&name)));
                 // This is an approximation.
                 nfa.memory_extra += name.len() + size_of::<u32>();
             } else {
                 self.nfa.borrow_mut().cap_index_to_name.push(None);
             }
         }

         let Some(slot) = index.checked_mul(2) else {
             return Err(Error::new("capture group slots exhausted"));
         };
         let start = self.add(State::Capture { target: 0, slot })?;
         let inner = self.c(hir)?;
         let Some(slot) = slot.checked_add(1) else {
             return Err(Error::new("capture group slots exhausted"));
         };
         let end = self.add(State::Capture { target: 0, slot })?;
         self.patch(start, inner.start)?;
         self.patch(inner.end, end)?;

         Ok(ThompsonRef { start, end })
     }

     /// Compile a concatenation of the sub-expressions yielded by the given
     /// iterator. If the iterator yields no elements, then this compiles down
     /// to an "empty" state that always matches.
     fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
     where
         I: Iterator<Item = Result<ThompsonRef, Error>>,
     {
         let ThompsonRef { start, mut end } = match it.next() {
             Some(result) => result?,
             None => return self.c_empty(),
         };
         for result in it {
             let compiled = result?;
             self.patch(end, compiled.start)?;
             end = compiled.end;
         }
         Ok(ThompsonRef { start, end })
     }

     /// Compile an alternation, where each element yielded by the given
     /// iterator represents an item in the alternation. If the iterator yields
     /// no elements, then this compiles down to a "fail" state.
     ///
     /// In an alternation, expressions appearing earlier are "preferred" at
     /// match time over expressions appearing later. (This is currently always
     /// true, as this crate only supports leftmost-first semantics.)
     fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
     where
         I: Iterator<Item = Result<ThompsonRef, Error>>,
     {
         let first = match it.next() {
             None => return self.c_fail(),
             Some(result) => result?,
         };
         let second = match it.next() {
             None => return Ok(first),
             Some(result) => result?,
         };

         let splits =
             self.add(State::Splits { targets: vec![], reverse: false })?;
         let end = self.add_empty()?;
         self.patch(splits, first.start)?;
         self.patch(first.end, end)?;
         self.patch(splits, second.start)?;
         self.patch(second.end, end)?;
         for result in it {
             let compiled = result?;
             self.patch(splits, compiled.start)?;
             self.patch(compiled.end, end)?;
         }
         Ok(ThompsonRef { start: splits, end })
     }

     /// Adds an "empty" state, i.e. an unconditional epsilon transition, with
     /// a placeholder target. Such states make NFA construction simpler.
     ///
     /// (The regex crate removes these in a second pass; this crate doesn't
     /// bother.)
     fn add_empty(&self) -> Result<StateID, Error> {
         let epsilon = State::Goto { target: 0, look: None };
         self.add(epsilon)
     }

     /// The common implementation of "add a state." It handles the common
     /// error cases of state ID exhausting (by owning state ID allocation) and
     /// whether the size limit has been exceeded.
     fn add(&self, state: State) -> Result<StateID, Error> {
         let id = u32::try_from(self.nfa.borrow().states.len())
             .map_err(|_| Error::new("exhausted state IDs, too many states"))?;
         self.nfa.borrow_mut().memory_extra += state.memory_usage();
         self.nfa.borrow_mut().states.push(state);
         self.check_size_limit()?;
         Ok(id)
     }

     /// Adds a transition from state `from` to state `to`.
     ///
     /// The name comes from the usual construction style: states are added
     /// first with dummy targets, because the states they should lead to may
     /// not exist yet, and the real targets are "patched" in afterwards.
     ///
     /// A `Splits` state gains `to` as an additional target. States with a
     /// single target have it overwritten. `Fail` and `Match` are left alone.
     ///
     /// # Errors
     ///
     /// Fails when the patch grows heap usage past the configured size limit.
     /// Only patching a `Splits` state grows heap usage.
     fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
         let grew = {
             let mut nfa = self.nfa.borrow_mut();
             let grew = match &mut nfa.states[from.as_usize()] {
                 State::Char { target, .. }
                 | State::Ranges { target, .. }
                 | State::Goto { target, .. }
                 | State::Capture { target, .. } => {
                     *target = to;
                     false
                 }
                 State::Splits { targets, .. } => {
                     targets.push(to);
                     true
                 }
                 State::Fail | State::Match => false,
             };
             if grew {
                 nfa.memory_extra += size_of::<StateID>();
             }
             grew
         };
         if grew {
             self.check_size_limit()?;
         }
         Ok(())
     }

     /// Checks that the current heap memory usage of the NFA being compiled
     /// doesn't exceed the configured size limit. If it does, an error is
     /// returned.
     fn check_size_limit(&self) -> Result<(), Error> {
         if let Some(limit) = self.config.size_limit {
             if self.nfa.borrow().memory_usage() > limit {
                 return Err(Error::new("compiled regex exceeded size limit"));
             }
         }
         Ok(())
     }
 }

 /// A value that represents the result of compiling a sub-expression of a
 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
 /// has an initial state at `start` and a final state at `end`.
 #[derive(Clone, Copy, Debug)]
 struct ThompsonRef {
     /// The ID of the state at which the sub-graph is entered.
     start: StateID,
     /// The ID of the state that callers patch so that the sub-graph leads to
     /// whatever follows it.
     end: StateID,
 }