blob: 8f37a5451269439ca69c348196e4c588b7ee8033 [file] [log] [blame] [edit]
use core::{cell::RefCell, mem::size_of};
use alloc::{string::String, sync::Arc, vec, vec::Vec};
use crate::{
error::Error,
hir::{self, Hir, HirKind},
int::U32,
};
pub(crate) type StateID = u32;
#[derive(Clone, Copy, Debug)]
pub(crate) struct Config {
pub(crate) size_limit: Option<usize>,
}
impl Default for Config {
fn default() -> Config {
Config { size_limit: Some(10 * (1 << 20)) }
}
}
#[derive(Clone)]
pub(crate) struct NFA {
/// The pattern string this NFA was generated from.
///
/// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯
pattern: String,
/// The states that make up this NFA.
states: Vec<State>,
/// The ID of the start state.
start: StateID,
/// Whether this NFA can only match at the beginning of a haystack.
is_start_anchored: bool,
/// Whether this NFA can match the empty string.
is_match_empty: bool,
/// If every match has the same number of matching capture groups, then
/// this corresponds to the number of groups.
static_explicit_captures_len: Option<usize>,
/// A map from capture group name to its corresponding index.
cap_name_to_index: CaptureNameMap,
/// A map from capture group index to the corresponding name, if one
/// exists.
cap_index_to_name: Vec<Option<Arc<str>>>,
/// Heap memory used indirectly by NFA states and other things (like the
/// various capturing group representations above). Since each state
/// might use a different amount of heap, we need to keep track of this
/// incrementally.
memory_extra: usize,
}
impl NFA {
/// Creates a new NFA from the given configuration and HIR.
pub(crate) fn new(
config: Config,
pattern: String,
hir: &Hir,
) -> Result<NFA, Error> {
Compiler::new(config, pattern).compile(hir)
}
/// Returns the pattern string used to construct this NFA.
pub(crate) fn pattern(&self) -> &str {
&self.pattern
}
/// Returns the state corresponding to the given ID.
///
/// # Panics
///
/// If the ID does not refer to a valid state, then this panics.
pub(crate) fn state(&self, id: StateID) -> &State {
&self.states[id.as_usize()]
}
/// Returns the total number of states in this NFA.
pub(crate) fn len(&self) -> usize {
self.states.len()
}
/// Returns the ID of the starting state for this NFA.
pub(crate) fn start(&self) -> StateID {
self.start
}
/// Returns the capture group index for the corresponding named group.
/// If no such group with the given name exists, then `None` is returned.
pub(crate) fn to_index(&self, name: &str) -> Option<usize> {
self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize())
}
/*
/// Returns the capture group name for the corresponding index.
/// If no such group with the given index, then `None` is returned.
pub(crate) fn to_name(&self, index: usize) -> Option<&str> {
self.cap_index_to_name.get(index)?.as_deref()
}
*/
/// Returns an iterator over all of the capture groups, along with their
/// names if they exist, in this NFA.
pub(crate) fn capture_names(&self) -> CaptureNames<'_> {
CaptureNames { it: self.cap_index_to_name.iter() }
}
/// Returns the total number of capture groups, including the first and
/// implicit group, in this NFA.
pub(crate) fn group_len(&self) -> usize {
self.cap_index_to_name.len()
}
/// Returns true if and only if this NFA can only match at the beginning of
/// a haystack.
pub(crate) fn is_start_anchored(&self) -> bool {
self.is_start_anchored
}
/// If the pattern always reports the same number of matching capture groups
/// for every match, then this returns the number of those groups. This
/// doesn't include the implicit group found in every pattern.
pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
self.static_explicit_captures_len
}
/// Returns the heap memory usage, in bytes, used by this NFA.
fn memory_usage(&self) -> usize {
(self.states.len() * size_of::<State>())
+ (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>())
+ self.memory_extra
}
}
impl core::fmt::Debug for NFA {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
writeln!(f, "NFA(")?;
writeln!(f, "pattern: {}", self.pattern)?;
for (sid, state) in self.states.iter().enumerate() {
writeln!(f, "{:07?}: {:?}", sid, state)?;
}
writeln!(f, ")")?;
Ok(())
}
}
/// An iterator over all capture groups in an NFA.
///
/// If a particular group has a name, then it is yielded. Otherwise, `None`
/// is yielded.
#[derive(Clone, Debug)]
pub(crate) struct CaptureNames<'a> {
it: core::slice::Iter<'a, Option<Arc<str>>>,
}
impl<'a> Iterator for CaptureNames<'a> {
type Item = Option<&'a str>;
fn next(&mut self) -> Option<Option<&'a str>> {
self.it.next().map(|n| n.as_deref())
}
}
#[derive(Clone, Eq, PartialEq)]
pub(crate) enum State {
Char { target: StateID, ch: char },
Ranges { target: StateID, ranges: Vec<(char, char)> },
Splits { targets: Vec<StateID>, reverse: bool },
Goto { target: StateID, look: Option<hir::Look> },
Capture { target: StateID, slot: u32 },
Fail,
Match,
}
impl State {
/// Returns the heap memory usage of this NFA state in bytes.
fn memory_usage(&self) -> usize {
match *self {
State::Char { .. }
| State::Goto { .. }
| State::Capture { .. }
| State::Fail { .. }
| State::Match => 0,
State::Splits { ref targets, .. } => {
targets.len() * size_of::<StateID>()
}
State::Ranges { ref ranges, .. } => {
ranges.len() * size_of::<(char, char)>()
}
}
}
/// Returns an iterator over the given split targets. The order of the
/// iterator yields elements in reverse when `reverse` is true.
pub(crate) fn iter_splits<'a>(
splits: &'a [StateID],
reverse: bool,
) -> impl Iterator<Item = StateID> + 'a {
let mut it = splits.iter();
core::iter::from_fn(move || {
if reverse { it.next_back() } else { it.next() }.copied()
})
}
}
impl core::fmt::Debug for State {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
State::Char { target, ch } => {
write!(f, "{:?} => {:?}", ch, target)
}
State::Ranges { target, ref ranges } => {
for (i, &(start, end)) in ranges.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{:?}-{:?} => {:?}", start, end, target)?;
}
Ok(())
}
State::Splits { ref targets, reverse } => {
write!(f, "splits(")?;
for (i, sid) in
State::iter_splits(targets, reverse).enumerate()
{
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{:?}", sid)?;
}
write!(f, ")")
}
State::Goto { target, look: None } => {
write!(f, "goto({:?})", target)
}
State::Goto { target, look: Some(look) } => {
write!(f, "{:?} => {:?}", look, target)
}
State::Capture { target, slot } => {
write!(f, "capture(slot={:?}) => {:?}", slot, target,)
}
State::Fail => write!(f, "FAIL"),
State::Match => {
write!(f, "MATCH")
}
}
}
}
/// A map from capture group name to its corresponding capture group index.
///
/// We define a type alias here so that we can transparently use a `HashMap`
/// whenever it's available. We do so presumably because it's faster, although
/// there are no benchmarks verifying this.
#[cfg(feature = "std")]
type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>;
#[cfg(not(feature = "std"))]
type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>;
#[derive(Debug)]
struct Compiler {
config: Config,
nfa: RefCell<NFA>,
}
impl Compiler {
fn new(config: Config, pattern: String) -> Compiler {
let nfa = RefCell::new(NFA {
pattern,
states: vec![],
start: 0,
is_start_anchored: false,
is_match_empty: false,
static_explicit_captures_len: None,
cap_name_to_index: CaptureNameMap::default(),
cap_index_to_name: vec![],
memory_extra: 0,
});
Compiler { config, nfa }
}
fn compile(self, hir: &Hir) -> Result<NFA, Error> {
self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored();
self.nfa.borrow_mut().is_match_empty = hir.is_match_empty();
self.nfa.borrow_mut().static_explicit_captures_len =
hir.static_explicit_captures_len();
let compiled = self.c_capture(0, None, hir)?;
let mat = self.add(State::Match)?;
self.patch(compiled.end, mat)?;
self.nfa.borrow_mut().start = compiled.start;
Ok(self.nfa.into_inner())
}
fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> {
match *hir.kind() {
HirKind::Empty => self.c_empty(),
HirKind::Char(ch) => self.c_char(ch),
HirKind::Class(ref class) => self.c_class(class),
HirKind::Look(ref look) => self.c_look(look),
HirKind::Repetition(ref rep) => self.c_repetition(rep),
HirKind::Capture(ref cap) => {
self.c_capture(cap.index, cap.name.as_deref(), &cap.sub)
}
HirKind::Concat(ref subs) => {
self.c_concat(subs.iter().map(|s| self.c(s)))
}
HirKind::Alternation(ref subs) => {
self.c_alternation(subs.iter().map(|s| self.c(s)))
}
}
}
/// Compile a "fail" state that can never be transitioned out of.
fn c_fail(&self) -> Result<ThompsonRef, Error> {
let id = self.add(State::Fail)?;
Ok(ThompsonRef { start: id, end: id })
}
/// Compile an "empty" state with one unconditional epsilon transition.
///
/// Both the `start` and `end` locations point to the state created.
/// Callers will likely want to keep the `start`, but patch the `end` to
/// point to some other state.
fn c_empty(&self) -> Result<ThompsonRef, Error> {
let id = self.add_empty()?;
Ok(ThompsonRef { start: id, end: id })
}
/// Compile the given literal char to an NFA.
fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
let id = self.add(State::Char { target: 0, ch })?;
Ok(ThompsonRef { start: id, end: id })
}
/// Compile the given character class into an NFA.
///
/// If the class is empty, then this compiles to a `Fail` state.
fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> {
let id = if class.ranges.is_empty() {
// Technically using an explicit fail state probably isn't
// necessary. Because if you try to match against an empty Ranges,
// then it should turn up with nothing regardless of input, and
// thus "acts" like a Fail state. But it's better to be more
// explicit, and there's no real cost to doing so.
self.add(State::Fail)
} else {
let ranges =
class.ranges.iter().map(|r| (r.start, r.end)).collect();
self.add(State::Ranges { target: 0, ranges })
}?;
Ok(ThompsonRef { start: id, end: id })
}
/// Compile the given HIR look-around assertion to an NFA look-around
/// assertion.
fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> {
let id = self.add(State::Goto { target: 0, look: Some(*look) })?;
Ok(ThompsonRef { start: id, end: id })
}
/// Compile the given repetition expression. This handles all types of
/// repetitions and greediness.
fn c_repetition(
&self,
rep: &hir::Repetition,
) -> Result<ThompsonRef, Error> {
match (rep.min, rep.max) {
(0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
(min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
(min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
(min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
}
}
/// Compile the given expression such that it matches at least `min` times,
/// but no more than `max` times.
///
/// When `greedy` is true, then the preference is for the expression to
/// match as much as possible. Otherwise, it will match as little as
/// possible.
fn c_bounded(
&self,
hir: &Hir,
greedy: bool,
min: u32,
max: u32,
) -> Result<ThompsonRef, Error> {
let prefix = self.c_exactly(hir, min)?;
if min == max {
return Ok(prefix);
}
// It is tempting here to compile the rest here as a concatenation
// of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
// were `aaa?a?a?`. The problem here is that it leads to this program:
//
// >000000: 61 => 01
// 000001: 61 => 02
// 000002: union(03, 04)
// 000003: 61 => 04
// 000004: union(05, 06)
// 000005: 61 => 06
// 000006: union(07, 08)
// 000007: 61 => 08
// 000008: MATCH
//
// And effectively, once you hit state 2, the epsilon closure will
// include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
// to instead compile it like so:
//
// >000000: 61 => 01
// 000001: 61 => 02
// 000002: union(03, 08)
// 000003: 61 => 04
// 000004: union(05, 08)
// 000005: 61 => 06
// 000006: union(07, 08)
// 000007: 61 => 08
// 000008: MATCH
//
// So that the epsilon closure of state 2 is now just 3 and 8.
let empty = self.add_empty()?;
let mut prev_end = prefix.end;
for _ in min..max {
let splits =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
let compiled = self.c(hir)?;
self.patch(prev_end, splits)?;
self.patch(splits, compiled.start)?;
self.patch(splits, empty)?;
prev_end = compiled.end;
}
self.patch(prev_end, empty)?;
Ok(ThompsonRef { start: prefix.start, end: empty })
}
/// Compile the given expression such that it may be matched `n` or more
/// times, where `n` can be any integer. (Although a particularly large
/// integer is likely to run afoul of any configured size limits.)
///
/// When `greedy` is true, then the preference is for the expression to
/// match as much as possible. Otherwise, it will match as little as
/// possible.
fn c_at_least(
&self,
hir: &Hir,
greedy: bool,
n: u32,
) -> Result<ThompsonRef, Error> {
if n == 0 {
// When the expression cannot match the empty string, then we
// can get away with something much simpler: just one 'alt'
// instruction that optionally repeats itself. But if the expr
// can match the empty string... see below.
if !hir.is_match_empty() {
let splits = self.add(State::Splits {
targets: vec![],
reverse: !greedy,
})?;
let compiled = self.c(hir)?;
self.patch(splits, compiled.start)?;
self.patch(compiled.end, splits)?;
return Ok(ThompsonRef { start: splits, end: splits });
}
// What's going on here? Shouldn't x* be simpler than this? It
// turns out that when implementing leftmost-first (Perl-like)
// match semantics, x* results in an incorrect preference order
// when computing the transitive closure of states if and only if
// 'x' can match the empty string. So instead, we compile x* as
// (x+)?, which preserves the correct preference order.
//
// See: https://github.com/rust-lang/regex/issues/779
let compiled = self.c(hir)?;
let plus =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
self.patch(compiled.end, plus)?;
self.patch(plus, compiled.start)?;
let question =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
let empty = self.add_empty()?;
self.patch(question, compiled.start)?;
self.patch(question, empty)?;
self.patch(plus, empty)?;
Ok(ThompsonRef { start: question, end: empty })
} else if n == 1 {
let compiled = self.c(hir)?;
let splits =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
self.patch(compiled.end, splits)?;
self.patch(splits, compiled.start)?;
Ok(ThompsonRef { start: compiled.start, end: splits })
} else {
let prefix = self.c_exactly(hir, n - 1)?;
let last = self.c(hir)?;
let splits =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
self.patch(prefix.end, last.start)?;
self.patch(last.end, splits)?;
self.patch(splits, last.start)?;
Ok(ThompsonRef { start: prefix.start, end: splits })
}
}
/// Compile the given expression such that it may be matched zero or one
/// times.
///
/// When `greedy` is true, then the preference is for the expression to
/// match as much as possible. Otherwise, it will match as little as
/// possible.
fn c_zero_or_one(
&self,
hir: &Hir,
greedy: bool,
) -> Result<ThompsonRef, Error> {
let splits =
self.add(State::Splits { targets: vec![], reverse: !greedy })?;
let compiled = self.c(hir)?;
let empty = self.add_empty()?;
self.patch(splits, compiled.start)?;
self.patch(splits, empty)?;
self.patch(compiled.end, empty)?;
Ok(ThompsonRef { start: splits, end: empty })
}
/// Compile the given HIR expression exactly `n` times.
fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> {
self.c_concat((0..n).map(|_| self.c(hir)))
}
/// Compile the given expression and insert capturing states at the
/// beginning and end of it. The slot for the capture states is computed
/// from the index.
fn c_capture(
&self,
index: u32,
name: Option<&str>,
hir: &Hir,
) -> Result<ThompsonRef, Error> {
// For discontiguous indices, push placeholders for earlier capture
// groups that weren't explicitly added. This can happen, for example,
// with patterns like '(a){0}(a)' where '(a){0}' is completely removed
// from the pattern.
let existing_groups_len = self.nfa.borrow().cap_index_to_name.len();
for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) {
self.nfa.borrow_mut().cap_index_to_name.push(None);
}
if index.as_usize() >= existing_groups_len {
if let Some(name) = name {
let name = Arc::from(name);
let mut nfa = self.nfa.borrow_mut();
nfa.cap_name_to_index.insert(Arc::clone(&name), index);
nfa.cap_index_to_name.push(Some(Arc::clone(&name)));
// This is an approximation.
nfa.memory_extra += name.len() + size_of::<u32>();
} else {
self.nfa.borrow_mut().cap_index_to_name.push(None);
}
}
let Some(slot) = index.checked_mul(2) else {
return Err(Error::new("capture group slots exhausted"));
};
let start = self.add(State::Capture { target: 0, slot })?;
let inner = self.c(hir)?;
let Some(slot) = slot.checked_add(1) else {
return Err(Error::new("capture group slots exhausted"));
};
let end = self.add(State::Capture { target: 0, slot })?;
self.patch(start, inner.start)?;
self.patch(inner.end, end)?;
Ok(ThompsonRef { start, end })
}
/// Compile a concatenation of the sub-expressions yielded by the given
/// iterator. If the iterator yields no elements, then this compiles down
/// to an "empty" state that always matches.
fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
where
I: Iterator<Item = Result<ThompsonRef, Error>>,
{
let ThompsonRef { start, mut end } = match it.next() {
Some(result) => result?,
None => return self.c_empty(),
};
for result in it {
let compiled = result?;
self.patch(end, compiled.start)?;
end = compiled.end;
}
Ok(ThompsonRef { start, end })
}
/// Compile an alternation, where each element yielded by the given
/// iterator represents an item in the alternation. If the iterator yields
/// no elements, then this compiles down to a "fail" state.
///
/// In an alternation, expressions appearing earlier are "preferred" at
/// match time over expressions appearing later. (This is currently always
/// true, as this crate only supports leftmost-first semantics.)
fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
where
I: Iterator<Item = Result<ThompsonRef, Error>>,
{
let first = match it.next() {
None => return self.c_fail(),
Some(result) => result?,
};
let second = match it.next() {
None => return Ok(first),
Some(result) => result?,
};
let splits =
self.add(State::Splits { targets: vec![], reverse: false })?;
let end = self.add_empty()?;
self.patch(splits, first.start)?;
self.patch(first.end, end)?;
self.patch(splits, second.start)?;
self.patch(second.end, end)?;
for result in it {
let compiled = result?;
self.patch(splits, compiled.start)?;
self.patch(compiled.end, end)?;
}
Ok(ThompsonRef { start: splits, end })
}
/// A convenience routine for adding an empty state, also known as an
/// unconditional epsilon transition. These are quite useful for making
/// NFA construction simpler.
///
/// (In the regex crate, we do a second pass to remove these, but don't
/// bother with that here.)
fn add_empty(&self) -> Result<StateID, Error> {
self.add(State::Goto { target: 0, look: None })
}
/// The common implementation of "add a state." It handles the common
/// error cases of state ID exhausting (by owning state ID allocation) and
/// whether the size limit has been exceeded.
fn add(&self, state: State) -> Result<StateID, Error> {
let id = u32::try_from(self.nfa.borrow().states.len())
.map_err(|_| Error::new("exhausted state IDs, too many states"))?;
self.nfa.borrow_mut().memory_extra += state.memory_usage();
self.nfa.borrow_mut().states.push(state);
self.check_size_limit()?;
Ok(id)
}
/// Add a transition from one state to another.
///
/// This routine is called "patch" since it is very common to add the
/// states you want, typically with "dummy" state ID transitions, and then
/// "patch" in the real state IDs later. This is because you don't always
/// know all of the necessary state IDs to add because they might not
/// exist yet.
///
/// # Errors
///
/// This may error if patching leads to an increase in heap usage beyond
/// the configured size limit. Heap usage only grows when patching adds a
/// new transition (as in the case of a "splits" state).
fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
let mut new_memory_extra = self.nfa.borrow().memory_extra;
match self.nfa.borrow_mut().states[from.as_usize()] {
State::Char { ref mut target, .. } => {
*target = to;
}
State::Ranges { ref mut target, .. } => {
*target = to;
}
State::Splits { ref mut targets, .. } => {
targets.push(to);
new_memory_extra += size_of::<StateID>();
}
State::Goto { ref mut target, .. } => {
*target = to;
}
State::Capture { ref mut target, .. } => {
*target = to;
}
State::Fail | State::Match => {}
}
if new_memory_extra != self.nfa.borrow().memory_extra {
self.nfa.borrow_mut().memory_extra = new_memory_extra;
self.check_size_limit()?;
}
Ok(())
}
/// Checks that the current heap memory usage of the NFA being compiled
/// doesn't exceed the configured size limit. If it does, an error is
/// returned.
fn check_size_limit(&self) -> Result<(), Error> {
if let Some(limit) = self.config.size_limit {
if self.nfa.borrow().memory_usage() > limit {
return Err(Error::new("compiled regex exceeded size limit"));
}
}
Ok(())
}
}
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
struct ThompsonRef {
start: StateID,
end: StateID,
}