| use core::{cell::RefCell, mem::size_of}; |
| |
| use alloc::{string::String, sync::Arc, vec, vec::Vec}; |
| |
| use crate::{ |
| error::Error, |
| hir::{self, Hir, HirKind}, |
| int::U32, |
| }; |
| |
| pub(crate) type StateID = u32; |
| |
| #[derive(Clone, Copy, Debug)] |
| pub(crate) struct Config { |
| pub(crate) size_limit: Option<usize>, |
| } |
| |
| impl Default for Config { |
| fn default() -> Config { |
| Config { size_limit: Some(10 * (1 << 20)) } |
| } |
| } |
| |
| #[derive(Clone)] |
| pub(crate) struct NFA { |
| /// The pattern string this NFA was generated from. |
| /// |
| /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ |
| pattern: String, |
| /// The states that make up this NFA. |
| states: Vec<State>, |
| /// The ID of the start state. |
| start: StateID, |
| /// Whether this NFA can only match at the beginning of a haystack. |
| is_start_anchored: bool, |
| /// Whether this NFA can match the empty string. |
| is_match_empty: bool, |
| /// If every match has the same number of matching capture groups, then |
| /// this corresponds to the number of groups. |
| static_explicit_captures_len: Option<usize>, |
| /// A map from capture group name to its corresponding index. |
| cap_name_to_index: CaptureNameMap, |
| /// A map from capture group index to the corresponding name, if one |
| /// exists. |
| cap_index_to_name: Vec<Option<Arc<str>>>, |
| /// Heap memory used indirectly by NFA states and other things (like the |
| /// various capturing group representations above). Since each state |
| /// might use a different amount of heap, we need to keep track of this |
| /// incrementally. |
| memory_extra: usize, |
| } |
| |
| impl NFA { |
| /// Creates a new NFA from the given configuration and HIR. |
| pub(crate) fn new( |
| config: Config, |
| pattern: String, |
| hir: &Hir, |
| ) -> Result<NFA, Error> { |
| Compiler::new(config, pattern).compile(hir) |
| } |
| |
| /// Returns the pattern string used to construct this NFA. |
| pub(crate) fn pattern(&self) -> &str { |
| &self.pattern |
| } |
| |
| /// Returns the state corresponding to the given ID. |
| /// |
| /// # Panics |
| /// |
| /// If the ID does not refer to a valid state, then this panics. |
| pub(crate) fn state(&self, id: StateID) -> &State { |
| &self.states[id.as_usize()] |
| } |
| |
| /// Returns the total number of states in this NFA. |
| pub(crate) fn len(&self) -> usize { |
| self.states.len() |
| } |
| |
| /// Returns the ID of the starting state for this NFA. |
| pub(crate) fn start(&self) -> StateID { |
| self.start |
| } |
| |
| /// Returns the capture group index for the corresponding named group. |
| /// If no such group with the given name exists, then `None` is returned. |
| pub(crate) fn to_index(&self, name: &str) -> Option<usize> { |
| self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) |
| } |
| |
| /* |
| /// Returns the capture group name for the corresponding index. |
| /// If no such group with the given index, then `None` is returned. |
| pub(crate) fn to_name(&self, index: usize) -> Option<&str> { |
| self.cap_index_to_name.get(index)?.as_deref() |
| } |
| */ |
| |
| /// Returns an iterator over all of the capture groups, along with their |
| /// names if they exist, in this NFA. |
| pub(crate) fn capture_names(&self) -> CaptureNames<'_> { |
| CaptureNames { it: self.cap_index_to_name.iter() } |
| } |
| |
| /// Returns the total number of capture groups, including the first and |
| /// implicit group, in this NFA. |
| pub(crate) fn group_len(&self) -> usize { |
| self.cap_index_to_name.len() |
| } |
| |
| /// Returns true if and only if this NFA can only match at the beginning of |
| /// a haystack. |
| pub(crate) fn is_start_anchored(&self) -> bool { |
| self.is_start_anchored |
| } |
| |
| /// If the pattern always reports the same number of matching capture groups |
| /// for every match, then this returns the number of those groups. This |
| /// doesn't include the implicit group found in every pattern. |
| pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
| self.static_explicit_captures_len |
| } |
| |
| /// Returns the heap memory usage, in bytes, used by this NFA. |
| fn memory_usage(&self) -> usize { |
| (self.states.len() * size_of::<State>()) |
| + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>()) |
| + self.memory_extra |
| } |
| } |
| |
| impl core::fmt::Debug for NFA { |
| fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| writeln!(f, "NFA(")?; |
| writeln!(f, "pattern: {}", self.pattern)?; |
| for (sid, state) in self.states.iter().enumerate() { |
| writeln!(f, "{:07?}: {:?}", sid, state)?; |
| } |
| writeln!(f, ")")?; |
| Ok(()) |
| } |
| } |
| |
| /// An iterator over all capture groups in an NFA. |
| /// |
| /// If a particular group has a name, then it is yielded. Otherwise, `None` |
| /// is yielded. |
| #[derive(Clone, Debug)] |
| pub(crate) struct CaptureNames<'a> { |
| it: core::slice::Iter<'a, Option<Arc<str>>>, |
| } |
| |
| impl<'a> Iterator for CaptureNames<'a> { |
| type Item = Option<&'a str>; |
| |
| fn next(&mut self) -> Option<Option<&'a str>> { |
| self.it.next().map(|n| n.as_deref()) |
| } |
| } |
| |
| #[derive(Clone, Eq, PartialEq)] |
| pub(crate) enum State { |
| Char { target: StateID, ch: char }, |
| Ranges { target: StateID, ranges: Vec<(char, char)> }, |
| Splits { targets: Vec<StateID>, reverse: bool }, |
| Goto { target: StateID, look: Option<hir::Look> }, |
| Capture { target: StateID, slot: u32 }, |
| Fail, |
| Match, |
| } |
| |
| impl State { |
| /// Returns the heap memory usage of this NFA state in bytes. |
| fn memory_usage(&self) -> usize { |
| match *self { |
| State::Char { .. } |
| | State::Goto { .. } |
| | State::Capture { .. } |
| | State::Fail { .. } |
| | State::Match => 0, |
| State::Splits { ref targets, .. } => { |
| targets.len() * size_of::<StateID>() |
| } |
| State::Ranges { ref ranges, .. } => { |
| ranges.len() * size_of::<(char, char)>() |
| } |
| } |
| } |
| |
| /// Returns an iterator over the given split targets. The order of the |
| /// iterator yields elements in reverse when `reverse` is true. |
| pub(crate) fn iter_splits<'a>( |
| splits: &'a [StateID], |
| reverse: bool, |
| ) -> impl Iterator<Item = StateID> + 'a { |
| let mut it = splits.iter(); |
| core::iter::from_fn(move || { |
| if reverse { it.next_back() } else { it.next() }.copied() |
| }) |
| } |
| } |
| |
| impl core::fmt::Debug for State { |
| fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| match *self { |
| State::Char { target, ch } => { |
| write!(f, "{:?} => {:?}", ch, target) |
| } |
| State::Ranges { target, ref ranges } => { |
| for (i, &(start, end)) in ranges.iter().enumerate() { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{:?}-{:?} => {:?}", start, end, target)?; |
| } |
| Ok(()) |
| } |
| State::Splits { ref targets, reverse } => { |
| write!(f, "splits(")?; |
| for (i, sid) in |
| State::iter_splits(targets, reverse).enumerate() |
| { |
| if i > 0 { |
| write!(f, ", ")?; |
| } |
| write!(f, "{:?}", sid)?; |
| } |
| write!(f, ")") |
| } |
| State::Goto { target, look: None } => { |
| write!(f, "goto({:?})", target) |
| } |
| State::Goto { target, look: Some(look) } => { |
| write!(f, "{:?} => {:?}", look, target) |
| } |
| State::Capture { target, slot } => { |
| write!(f, "capture(slot={:?}) => {:?}", slot, target,) |
| } |
| State::Fail => write!(f, "FAIL"), |
| State::Match => { |
| write!(f, "MATCH") |
| } |
| } |
| } |
| } |
| |
| /// A map from capture group name to its corresponding capture group index. |
| /// |
| /// We define a type alias here so that we can transparently use a `HashMap` |
| /// whenever it's available. We do so presumably because it's faster, although |
| /// there are no benchmarks verifying this. |
| #[cfg(feature = "std")] |
| type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>; |
| #[cfg(not(feature = "std"))] |
| type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>; |
| |
| #[derive(Debug)] |
| struct Compiler { |
| config: Config, |
| nfa: RefCell<NFA>, |
| } |
| |
| impl Compiler { |
| fn new(config: Config, pattern: String) -> Compiler { |
| let nfa = RefCell::new(NFA { |
| pattern, |
| states: vec![], |
| start: 0, |
| is_start_anchored: false, |
| is_match_empty: false, |
| static_explicit_captures_len: None, |
| cap_name_to_index: CaptureNameMap::default(), |
| cap_index_to_name: vec![], |
| memory_extra: 0, |
| }); |
| Compiler { config, nfa } |
| } |
| |
| fn compile(self, hir: &Hir) -> Result<NFA, Error> { |
| self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); |
| self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); |
| self.nfa.borrow_mut().static_explicit_captures_len = |
| hir.static_explicit_captures_len(); |
| let compiled = self.c_capture(0, None, hir)?; |
| let mat = self.add(State::Match)?; |
| self.patch(compiled.end, mat)?; |
| self.nfa.borrow_mut().start = compiled.start; |
| Ok(self.nfa.into_inner()) |
| } |
| |
| fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> { |
| match *hir.kind() { |
| HirKind::Empty => self.c_empty(), |
| HirKind::Char(ch) => self.c_char(ch), |
| HirKind::Class(ref class) => self.c_class(class), |
| HirKind::Look(ref look) => self.c_look(look), |
| HirKind::Repetition(ref rep) => self.c_repetition(rep), |
| HirKind::Capture(ref cap) => { |
| self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) |
| } |
| HirKind::Concat(ref subs) => { |
| self.c_concat(subs.iter().map(|s| self.c(s))) |
| } |
| HirKind::Alternation(ref subs) => { |
| self.c_alternation(subs.iter().map(|s| self.c(s))) |
| } |
| } |
| } |
| |
| /// Compile a "fail" state that can never be transitioned out of. |
| fn c_fail(&self) -> Result<ThompsonRef, Error> { |
| let id = self.add(State::Fail)?; |
| Ok(ThompsonRef { start: id, end: id }) |
| } |
| |
| /// Compile an "empty" state with one unconditional epsilon transition. |
| /// |
| /// Both the `start` and `end` locations point to the state created. |
| /// Callers will likely want to keep the `start`, but patch the `end` to |
| /// point to some other state. |
| fn c_empty(&self) -> Result<ThompsonRef, Error> { |
| let id = self.add_empty()?; |
| Ok(ThompsonRef { start: id, end: id }) |
| } |
| |
| /// Compile the given literal char to an NFA. |
| fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { |
| let id = self.add(State::Char { target: 0, ch })?; |
| Ok(ThompsonRef { start: id, end: id }) |
| } |
| |
| /// Compile the given character class into an NFA. |
| /// |
| /// If the class is empty, then this compiles to a `Fail` state. |
| fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> { |
| let id = if class.ranges.is_empty() { |
| // Technically using an explicit fail state probably isn't |
| // necessary. Because if you try to match against an empty Ranges, |
| // then it should turn up with nothing regardless of input, and |
| // thus "acts" like a Fail state. But it's better to be more |
| // explicit, and there's no real cost to doing so. |
| self.add(State::Fail) |
| } else { |
| let ranges = |
| class.ranges.iter().map(|r| (r.start, r.end)).collect(); |
| self.add(State::Ranges { target: 0, ranges }) |
| }?; |
| Ok(ThompsonRef { start: id, end: id }) |
| } |
| |
| /// Compile the given HIR look-around assertion to an NFA look-around |
| /// assertion. |
| fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> { |
| let id = self.add(State::Goto { target: 0, look: Some(*look) })?; |
| Ok(ThompsonRef { start: id, end: id }) |
| } |
| |
| /// Compile the given repetition expression. This handles all types of |
| /// repetitions and greediness. |
| fn c_repetition( |
| &self, |
| rep: &hir::Repetition, |
| ) -> Result<ThompsonRef, Error> { |
| match (rep.min, rep.max) { |
| (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
| (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
| (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
| (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
| } |
| } |
| |
| /// Compile the given expression such that it matches at least `min` times, |
| /// but no more than `max` times. |
| /// |
| /// When `greedy` is true, then the preference is for the expression to |
| /// match as much as possible. Otherwise, it will match as little as |
| /// possible. |
| fn c_bounded( |
| &self, |
| hir: &Hir, |
| greedy: bool, |
| min: u32, |
| max: u32, |
| ) -> Result<ThompsonRef, Error> { |
| let prefix = self.c_exactly(hir, min)?; |
| if min == max { |
| return Ok(prefix); |
| } |
| |
| // It is tempting here to compile the rest here as a concatenation |
| // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
| // were `aaa?a?a?`. The problem here is that it leads to this program: |
| // |
| // >000000: 61 => 01 |
| // 000001: 61 => 02 |
| // 000002: union(03, 04) |
| // 000003: 61 => 04 |
| // 000004: union(05, 06) |
| // 000005: 61 => 06 |
| // 000006: union(07, 08) |
| // 000007: 61 => 08 |
| // 000008: MATCH |
| // |
| // And effectively, once you hit state 2, the epsilon closure will |
| // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better |
| // to instead compile it like so: |
| // |
| // >000000: 61 => 01 |
| // 000001: 61 => 02 |
| // 000002: union(03, 08) |
| // 000003: 61 => 04 |
| // 000004: union(05, 08) |
| // 000005: 61 => 06 |
| // 000006: union(07, 08) |
| // 000007: 61 => 08 |
| // 000008: MATCH |
| // |
| // So that the epsilon closure of state 2 is now just 3 and 8. |
| let empty = self.add_empty()?; |
| let mut prev_end = prefix.end; |
| for _ in min..max { |
| let splits = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| let compiled = self.c(hir)?; |
| self.patch(prev_end, splits)?; |
| self.patch(splits, compiled.start)?; |
| self.patch(splits, empty)?; |
| prev_end = compiled.end; |
| } |
| self.patch(prev_end, empty)?; |
| Ok(ThompsonRef { start: prefix.start, end: empty }) |
| } |
| |
| /// Compile the given expression such that it may be matched `n` or more |
| /// times, where `n` can be any integer. (Although a particularly large |
| /// integer is likely to run afoul of any configured size limits.) |
| /// |
| /// When `greedy` is true, then the preference is for the expression to |
| /// match as much as possible. Otherwise, it will match as little as |
| /// possible. |
| fn c_at_least( |
| &self, |
| hir: &Hir, |
| greedy: bool, |
| n: u32, |
| ) -> Result<ThompsonRef, Error> { |
| if n == 0 { |
| // When the expression cannot match the empty string, then we |
| // can get away with something much simpler: just one 'alt' |
| // instruction that optionally repeats itself. But if the expr |
| // can match the empty string... see below. |
| if !hir.is_match_empty() { |
| let splits = self.add(State::Splits { |
| targets: vec![], |
| reverse: !greedy, |
| })?; |
| let compiled = self.c(hir)?; |
| self.patch(splits, compiled.start)?; |
| self.patch(compiled.end, splits)?; |
| return Ok(ThompsonRef { start: splits, end: splits }); |
| } |
| |
| // What's going on here? Shouldn't x* be simpler than this? It |
| // turns out that when implementing leftmost-first (Perl-like) |
| // match semantics, x* results in an incorrect preference order |
| // when computing the transitive closure of states if and only if |
| // 'x' can match the empty string. So instead, we compile x* as |
| // (x+)?, which preserves the correct preference order. |
| // |
| // See: https://github.com/rust-lang/regex/issues/779 |
| let compiled = self.c(hir)?; |
| let plus = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| self.patch(compiled.end, plus)?; |
| self.patch(plus, compiled.start)?; |
| |
| let question = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| let empty = self.add_empty()?; |
| self.patch(question, compiled.start)?; |
| self.patch(question, empty)?; |
| self.patch(plus, empty)?; |
| Ok(ThompsonRef { start: question, end: empty }) |
| } else if n == 1 { |
| let compiled = self.c(hir)?; |
| let splits = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| self.patch(compiled.end, splits)?; |
| self.patch(splits, compiled.start)?; |
| Ok(ThompsonRef { start: compiled.start, end: splits }) |
| } else { |
| let prefix = self.c_exactly(hir, n - 1)?; |
| let last = self.c(hir)?; |
| let splits = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| self.patch(prefix.end, last.start)?; |
| self.patch(last.end, splits)?; |
| self.patch(splits, last.start)?; |
| Ok(ThompsonRef { start: prefix.start, end: splits }) |
| } |
| } |
| |
| /// Compile the given expression such that it may be matched zero or one |
| /// times. |
| /// |
| /// When `greedy` is true, then the preference is for the expression to |
| /// match as much as possible. Otherwise, it will match as little as |
| /// possible. |
| fn c_zero_or_one( |
| &self, |
| hir: &Hir, |
| greedy: bool, |
| ) -> Result<ThompsonRef, Error> { |
| let splits = |
| self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| let compiled = self.c(hir)?; |
| let empty = self.add_empty()?; |
| self.patch(splits, compiled.start)?; |
| self.patch(splits, empty)?; |
| self.patch(compiled.end, empty)?; |
| Ok(ThompsonRef { start: splits, end: empty }) |
| } |
| |
| /// Compile the given HIR expression exactly `n` times. |
| fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> { |
| self.c_concat((0..n).map(|_| self.c(hir))) |
| } |
| |
| /// Compile the given expression and insert capturing states at the |
| /// beginning and end of it. The slot for the capture states is computed |
| /// from the index. |
| fn c_capture( |
| &self, |
| index: u32, |
| name: Option<&str>, |
| hir: &Hir, |
| ) -> Result<ThompsonRef, Error> { |
| // For discontiguous indices, push placeholders for earlier capture |
| // groups that weren't explicitly added. This can happen, for example, |
| // with patterns like '(a){0}(a)' where '(a){0}' is completely removed |
| // from the pattern. |
| let existing_groups_len = self.nfa.borrow().cap_index_to_name.len(); |
| for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) { |
| self.nfa.borrow_mut().cap_index_to_name.push(None); |
| } |
| if index.as_usize() >= existing_groups_len { |
| if let Some(name) = name { |
| let name = Arc::from(name); |
| let mut nfa = self.nfa.borrow_mut(); |
| nfa.cap_name_to_index.insert(Arc::clone(&name), index); |
| nfa.cap_index_to_name.push(Some(Arc::clone(&name))); |
| // This is an approximation. |
| nfa.memory_extra += name.len() + size_of::<u32>(); |
| } else { |
| self.nfa.borrow_mut().cap_index_to_name.push(None); |
| } |
| } |
| |
| let Some(slot) = index.checked_mul(2) else { |
| return Err(Error::new("capture group slots exhausted")); |
| }; |
| let start = self.add(State::Capture { target: 0, slot })?; |
| let inner = self.c(hir)?; |
| let Some(slot) = slot.checked_add(1) else { |
| return Err(Error::new("capture group slots exhausted")); |
| }; |
| let end = self.add(State::Capture { target: 0, slot })?; |
| self.patch(start, inner.start)?; |
| self.patch(inner.end, end)?; |
| |
| Ok(ThompsonRef { start, end }) |
| } |
| |
| /// Compile a concatenation of the sub-expressions yielded by the given |
| /// iterator. If the iterator yields no elements, then this compiles down |
| /// to an "empty" state that always matches. |
| fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
| where |
| I: Iterator<Item = Result<ThompsonRef, Error>>, |
| { |
| let ThompsonRef { start, mut end } = match it.next() { |
| Some(result) => result?, |
| None => return self.c_empty(), |
| }; |
| for result in it { |
| let compiled = result?; |
| self.patch(end, compiled.start)?; |
| end = compiled.end; |
| } |
| Ok(ThompsonRef { start, end }) |
| } |
| |
| /// Compile an alternation, where each element yielded by the given |
| /// iterator represents an item in the alternation. If the iterator yields |
| /// no elements, then this compiles down to a "fail" state. |
| /// |
| /// In an alternation, expressions appearing earlier are "preferred" at |
| /// match time over expressions appearing later. (This is currently always |
| /// true, as this crate only supports leftmost-first semantics.) |
| fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
| where |
| I: Iterator<Item = Result<ThompsonRef, Error>>, |
| { |
| let first = match it.next() { |
| None => return self.c_fail(), |
| Some(result) => result?, |
| }; |
| let second = match it.next() { |
| None => return Ok(first), |
| Some(result) => result?, |
| }; |
| |
| let splits = |
| self.add(State::Splits { targets: vec![], reverse: false })?; |
| let end = self.add_empty()?; |
| self.patch(splits, first.start)?; |
| self.patch(first.end, end)?; |
| self.patch(splits, second.start)?; |
| self.patch(second.end, end)?; |
| for result in it { |
| let compiled = result?; |
| self.patch(splits, compiled.start)?; |
| self.patch(compiled.end, end)?; |
| } |
| Ok(ThompsonRef { start: splits, end }) |
| } |
| |
| /// A convenience routine for adding an empty state, also known as an |
| /// unconditional epsilon transition. These are quite useful for making |
| /// NFA construction simpler. |
| /// |
| /// (In the regex crate, we do a second pass to remove these, but don't |
| /// bother with that here.) |
| fn add_empty(&self) -> Result<StateID, Error> { |
| self.add(State::Goto { target: 0, look: None }) |
| } |
| |
| /// The common implementation of "add a state." It handles the common |
| /// error cases of state ID exhausting (by owning state ID allocation) and |
| /// whether the size limit has been exceeded. |
| fn add(&self, state: State) -> Result<StateID, Error> { |
| let id = u32::try_from(self.nfa.borrow().states.len()) |
| .map_err(|_| Error::new("exhausted state IDs, too many states"))?; |
| self.nfa.borrow_mut().memory_extra += state.memory_usage(); |
| self.nfa.borrow_mut().states.push(state); |
| self.check_size_limit()?; |
| Ok(id) |
| } |
| |
| /// Add a transition from one state to another. |
| /// |
| /// This routine is called "patch" since it is very common to add the |
| /// states you want, typically with "dummy" state ID transitions, and then |
| /// "patch" in the real state IDs later. This is because you don't always |
| /// know all of the necessary state IDs to add because they might not |
| /// exist yet. |
| /// |
| /// # Errors |
| /// |
| /// This may error if patching leads to an increase in heap usage beyond |
| /// the configured size limit. Heap usage only grows when patching adds a |
| /// new transition (as in the case of a "splits" state). |
| fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { |
| let mut new_memory_extra = self.nfa.borrow().memory_extra; |
| match self.nfa.borrow_mut().states[from.as_usize()] { |
| State::Char { ref mut target, .. } => { |
| *target = to; |
| } |
| State::Ranges { ref mut target, .. } => { |
| *target = to; |
| } |
| State::Splits { ref mut targets, .. } => { |
| targets.push(to); |
| new_memory_extra += size_of::<StateID>(); |
| } |
| State::Goto { ref mut target, .. } => { |
| *target = to; |
| } |
| State::Capture { ref mut target, .. } => { |
| *target = to; |
| } |
| State::Fail | State::Match => {} |
| } |
| if new_memory_extra != self.nfa.borrow().memory_extra { |
| self.nfa.borrow_mut().memory_extra = new_memory_extra; |
| self.check_size_limit()?; |
| } |
| Ok(()) |
| } |
| |
| /// Checks that the current heap memory usage of the NFA being compiled |
| /// doesn't exceed the configured size limit. If it does, an error is |
| /// returned. |
| fn check_size_limit(&self) -> Result<(), Error> { |
| if let Some(limit) = self.config.size_limit { |
| if self.nfa.borrow().memory_usage() > limit { |
| return Err(Error::new("compiled regex exceeded size limit")); |
| } |
| } |
| Ok(()) |
| } |
| } |
| |
| /// A value that represents the result of compiling a sub-expression of a |
| /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
| /// has an initial state at `start` and a final state at `end`. |
| #[derive(Clone, Copy, Debug)] |
| struct ThompsonRef { |
| start: StateID, |
| end: StateID, |
| } |