blob: 7572f9f105d413cd97faabe5abaa165c3e4a85f3 [file] [log] [blame]
use alloc::{sync::Arc, vec, vec::Vec};
use crate::{
nfa::thompson::{self, Error, State, NFA},
util::{
id::{PatternID, StateID},
matchtypes::MultiMatch,
sparse_set::SparseSet,
},
};
#[derive(Clone, Copy, Debug, Default)]
pub struct Config {
anchored: Option<bool>,
utf8: Option<bool>,
}
impl Config {
/// Return a new default PikeVM configuration.
pub fn new() -> Config {
Config::default()
}
pub fn anchored(mut self, yes: bool) -> Config {
self.anchored = Some(yes);
self
}
pub fn utf8(mut self, yes: bool) -> Config {
self.utf8 = Some(yes);
self
}
pub fn get_anchored(&self) -> bool {
self.anchored.unwrap_or(false)
}
pub fn get_utf8(&self) -> bool {
self.utf8.unwrap_or(true)
}
pub(crate) fn overwrite(self, o: Config) -> Config {
Config {
anchored: o.anchored.or(self.anchored),
utf8: o.utf8.or(self.utf8),
}
}
}
/// A builder for a PikeVM.
#[derive(Clone, Debug)]
pub struct Builder {
config: Config,
thompson: thompson::Builder,
}
impl Builder {
/// Create a new PikeVM builder with its default configuration.
pub fn new() -> Builder {
Builder {
config: Config::default(),
thompson: thompson::Builder::new(),
}
}
pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
self.build_many(&[pattern])
}
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<PikeVM, Error> {
let nfa = self.thompson.build_many(patterns)?;
self.build_from_nfa(Arc::new(nfa))
}
pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
// TODO: Check that this is correct.
// if !cfg!(all(
// feature = "dfa",
// feature = "syntax",
// feature = "unicode-perl"
// )) {
if !cfg!(feature = "syntax") {
if nfa.has_word_boundary_unicode() {
return Err(Error::unicode_word_unavailable());
}
}
Ok(PikeVM { config: self.config, nfa })
}
pub fn configure(&mut self, config: Config) -> &mut Builder {
self.config = self.config.overwrite(config);
self
}
/// Set the syntax configuration for this builder using
/// [`SyntaxConfig`](crate::SyntaxConfig).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
/// These settings only apply when constructing a PikeVM directly from a
/// pattern.
pub fn syntax(
&mut self,
config: crate::util::syntax::SyntaxConfig,
) -> &mut Builder {
self.thompson.syntax(config);
self
}
/// Set the Thompson NFA configuration for this builder using
/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
///
/// This permits setting things like if additional time should be spent
/// shrinking the size of the NFA.
///
/// These settings only apply when constructing a PikeVM directly from a
/// pattern.
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.thompson.configure(config);
self
}
}
#[derive(Clone, Debug)]
pub struct PikeVM {
config: Config,
nfa: Arc<NFA>,
}
impl PikeVM {
pub fn new(pattern: &str) -> Result<PikeVM, Error> {
PikeVM::builder().build(pattern)
}
pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
PikeVM::builder().build_many(patterns)
}
pub fn config() -> Config {
Config::new()
}
pub fn builder() -> Builder {
Builder::new()
}
pub fn create_cache(&self) -> Cache {
Cache::new(self.nfa())
}
pub fn create_captures(&self) -> Captures {
Captures::new(self.nfa())
}
pub fn nfa(&self) -> &Arc<NFA> {
&self.nfa
}
pub fn find_leftmost_iter<'r, 'c, 't>(
&'r self,
cache: &'c mut Cache,
haystack: &'t [u8],
) -> FindLeftmostMatches<'r, 'c, 't> {
FindLeftmostMatches::new(self, cache, haystack)
}
// BREADCRUMBS:
//
// 1) Don't forget about prefilters.
//
// 2) Consider the case of using a PikeVM with an NFA that has Capture
// states, but where we don't want to track capturing groups (other than
// group 0). This potentially saves a lot of copying around and what not. I
// believe the current regex crate does this, for example. The interesting
// bit here is how to handle the case of multiple patterns...
//
// 3) Permit the caller to specify a pattern ID to run an anchored-only
// search on.
//
// 4) How to do overlapping? The way multi-regex support works in the regex
// crate currently is to run the PikeVM until either we reach the end of
// the haystack or when we know all regexes have matched. The latter case
// is probably quite rare, so the common case is likely that we're always
// searching the entire input. The question is: can we emulate that with
// our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
// all we need to do is provide an overlapping API on the PikeVM that
// roughly matches the ones we provide on DFAs. For those APIs, the only
// thing they need over non-overlapping APIs is "caller state." For DFAs,
// the caller state is simple: it contains the last state visited and the
// last match reported. For the PikeVM (and NFAs in general), the "last
// state" is actually a *set* of NFA states. So I think what happens here
// is that we can just force the `Cache` to subsume this role. We'll still
// need some additional state to track the last match reported though.
// Because when two or more patterns match at the same location, we need a
// way to know to iterate over them. Although maybe it's not match index we
// need, but the state index of the last NFA state processed in the cache.
// Then we just pick up where we left off. There might be another match
// state, in which case, we report it.
pub fn find_leftmost_at(
&self,
cache: &mut Cache,
haystack: &[u8],
start: usize,
end: usize,
caps: &mut Captures,
) -> Option<MultiMatch> {
let anchored =
self.config.get_anchored() || self.nfa.is_always_start_anchored();
let mut at = start;
let mut matched_pid = None;
cache.clear();
'LOOP: loop {
if cache.clist.set.is_empty() {
if matched_pid.is_some() || (anchored && at > start) {
break 'LOOP;
}
// TODO: prefilter
}
if (!anchored && matched_pid.is_none())
|| cache.clist.set.is_empty()
{
self.epsilon_closure(
&mut cache.clist,
&mut caps.slots,
&mut cache.stack,
self.nfa.start_anchored(),
haystack,
at,
);
}
for i in 0..cache.clist.set.len() {
let sid = cache.clist.set.get(i);
let pid = match self.step(
&mut cache.nlist,
&mut caps.slots,
cache.clist.caps(sid),
&mut cache.stack,
sid,
haystack,
at,
) {
None => continue,
Some(pid) => pid,
};
matched_pid = Some(pid);
break;
}
if at >= end {
break;
}
at += 1;
cache.swap();
cache.nlist.set.clear();
}
matched_pid.map(|pid| {
let slots = self.nfa.pattern_slots(pid);
let (start, end) = (slots.start, slots.start + 1);
MultiMatch::new(
pid,
caps.slots[start].unwrap(),
caps.slots[end].unwrap(),
)
})
}
#[inline(always)]
fn step(
&self,
nlist: &mut Threads,
slots: &mut [Slot],
thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
sid: StateID,
haystack: &[u8],
at: usize,
) -> Option<PatternID> {
match *self.nfa.state(sid) {
State::Fail
| State::Look { .. }
| State::Union { .. }
| State::Capture { .. } => None,
State::Range { ref range } => {
if range.matches(haystack, at) {
self.epsilon_closure(
nlist,
thread_caps,
stack,
range.next,
haystack,
at + 1,
);
}
None
}
State::Sparse(ref sparse) => {
if let Some(next) = sparse.matches(haystack, at) {
self.epsilon_closure(
nlist,
thread_caps,
stack,
next,
haystack,
at + 1,
);
}
None
}
State::Match { id } => {
slots.copy_from_slice(thread_caps);
Some(id)
}
}
}
#[inline(always)]
fn epsilon_closure(
&self,
nlist: &mut Threads,
thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
sid: StateID,
haystack: &[u8],
at: usize,
) {
stack.push(FollowEpsilon::StateID(sid));
while let Some(frame) = stack.pop() {
match frame {
FollowEpsilon::StateID(sid) => {
self.epsilon_closure_step(
nlist,
thread_caps,
stack,
sid,
haystack,
at,
);
}
FollowEpsilon::Capture { slot, pos } => {
thread_caps[slot] = pos;
}
}
}
}
#[inline(always)]
fn epsilon_closure_step(
&self,
nlist: &mut Threads,
thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
mut sid: StateID,
haystack: &[u8],
at: usize,
) {
loop {
if !nlist.set.insert(sid) {
return;
}
match *self.nfa.state(sid) {
State::Fail
| State::Range { .. }
| State::Sparse { .. }
| State::Match { .. } => {
let t = &mut nlist.caps(sid);
t.copy_from_slice(thread_caps);
return;
}
State::Look { look, next } => {
if !look.matches(haystack, at) {
return;
}
sid = next;
}
State::Union { ref alternates } => {
sid = match alternates.get(0) {
None => return,
Some(&sid) => sid,
};
stack.extend(
alternates[1..]
.iter()
.copied()
.rev()
.map(FollowEpsilon::StateID),
);
}
State::Capture { next, slot } => {
if slot < thread_caps.len() {
stack.push(FollowEpsilon::Capture {
slot,
pos: thread_caps[slot],
});
thread_caps[slot] = Some(at);
}
sid = next;
}
}
}
}
}
/// An iterator over all non-overlapping leftmost matches for a particular
/// infallible search.
///
/// The iterator yields a [`MultiMatch`] value until no more matches could be
/// found. If the underlying search returns an error, then this panics.
///
/// The lifetime variables are as follows:
///
/// * `'r` is the lifetime of the regular expression itself.
/// * `'c` is the lifetime of the mutable cache used during search.
/// * `'t` is the lifetime of the text being searched.
#[derive(Debug)]
pub struct FindLeftmostMatches<'r, 'c, 't> {
vm: &'r PikeVM,
cache: &'c mut Cache,
// scanner: Option<prefilter::Scanner<'r>>,
text: &'t [u8],
last_end: usize,
last_match: Option<usize>,
}
impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
fn new(
vm: &'r PikeVM,
cache: &'c mut Cache,
text: &'t [u8],
) -> FindLeftmostMatches<'r, 'c, 't> {
FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
}
}
impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
// type Item = Captures;
type Item = MultiMatch;
// fn next(&mut self) -> Option<Captures> {
fn next(&mut self) -> Option<MultiMatch> {
if self.last_end > self.text.len() {
return None;
}
let mut caps = self.vm.create_captures();
let m = self.vm.find_leftmost_at(
self.cache,
self.text,
self.last_end,
self.text.len(),
&mut caps,
)?;
if m.is_empty() {
// This is an empty match. To ensure we make progress, start
// the next search at the smallest possible starting position
// of the next match following this one.
self.last_end = if self.vm.config.get_utf8() {
crate::util::next_utf8(self.text, m.end())
} else {
m.end() + 1
};
// Don't accept empty matches immediately following a match.
// Just move on to the next match.
if Some(m.end()) == self.last_match {
return self.next();
}
} else {
self.last_end = m.end();
}
self.last_match = Some(m.end());
Some(m)
}
}
#[derive(Clone, Debug)]
pub struct Captures {
slots: Vec<Slot>,
}
impl Captures {
pub fn new(nfa: &NFA) -> Captures {
Captures { slots: vec![None; nfa.capture_slot_len()] }
}
}
#[derive(Clone, Debug)]
pub struct Cache {
stack: Vec<FollowEpsilon>,
clist: Threads,
nlist: Threads,
}
type Slot = Option<usize>;
#[derive(Clone, Debug)]
struct Threads {
set: SparseSet,
caps: Vec<Slot>,
slots_per_thread: usize,
}
#[derive(Clone, Debug)]
enum FollowEpsilon {
StateID(StateID),
Capture { slot: usize, pos: Slot },
}
impl Cache {
pub fn new(nfa: &NFA) -> Cache {
Cache {
stack: vec![],
clist: Threads::new(nfa),
nlist: Threads::new(nfa),
}
}
fn clear(&mut self) {
self.stack.clear();
self.clist.set.clear();
self.nlist.set.clear();
}
fn swap(&mut self) {
core::mem::swap(&mut self.clist, &mut self.nlist);
}
}
impl Threads {
fn new(nfa: &NFA) -> Threads {
let mut threads = Threads {
set: SparseSet::new(0),
caps: vec![],
slots_per_thread: 0,
};
threads.resize(nfa);
threads
}
fn resize(&mut self, nfa: &NFA) {
if nfa.states().len() == self.set.capacity() {
return;
}
self.slots_per_thread = nfa.capture_slot_len();
self.set.resize(nfa.states().len());
self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
}
fn caps(&mut self, sid: StateID) -> &mut [Slot] {
let i = sid.as_usize() * self.slots_per_thread;
&mut self.caps[i..i + self.slots_per_thread]
}
}