Blame - src/determinize.rs - platform/external/rust/crates/regex-automata

blob: cf0c285857ca92ab84cc11eb22a2ab4fc9d36f8f [file] [log] [blame]

Jakub Kotur	3bceaeb	2020-12-21 17:28:16 +0100	[diff] [blame]	1	use std::collections::HashMap;
				2	use std::mem;
				3	use std::rc::Rc;
				4
				5	use dense;
				6	use error::Result;
				7	use nfa::{self, NFA};
				8	use sparse_set::SparseSet;
				9	use state_id::{dead_id, StateID};
				10
				11	type DFARepr<S> = dense::Repr<Vec<S>, S>;
				12
				13	/// A determinizer converts an NFA to a DFA.
				14	///
				15	/// This determinizer follows the typical powerset construction, where each
				16	/// DFA state is comprised of one or more NFA states. In the worst case, there
				17	/// is one DFA state for every possible combination of NFA states. In practice,
				18	/// this only happens in certain conditions, typically when there are bounded
				19	/// repetitions.
				20	///
				21	/// The type variable `S` refers to the chosen state identifier representation
				22	/// used for the DFA.
				23	///
				24	/// The lifetime variable `'a` refers to the lifetime of the NFA being
				25	/// converted to a DFA.
				26	#[derive(Debug)]
				27	pub(crate) struct Determinizer<'a, S: StateID> {
				28	/// The NFA we're converting into a DFA.
				29	nfa: &'a NFA,
				30	/// The DFA we're building.
				31	dfa: DFARepr<S>,
				32	/// Each DFA state being built is defined as an ordered set of NFA
				33	/// states, along with a flag indicating whether the state is a match
				34	/// state or not.
				35	///
				36	/// This is never empty. The first state is always a dummy state such that
				37	/// a state id == 0 corresponds to a dead state.
				38	builder_states: Vec<Rc<State>>,
				39	/// A cache of DFA states that already exist and can be easily looked up
				40	/// via ordered sets of NFA states.
				41	cache: HashMap<Rc<State>, S>,
				42	/// Scratch space for a stack of NFA states to visit, for depth first
				43	/// visiting without recursion.
				44	stack: Vec<nfa::StateID>,
				45	/// Scratch space for storing an ordered sequence of NFA states, for
				46	/// amortizing allocation.
				47	scratch_nfa_states: Vec<nfa::StateID>,
				48	/// Whether to build a DFA that finds the longest possible match.
				49	longest_match: bool,
				50	}
				51
				52	/// An intermediate representation for a DFA state during determinization.
				53	#[derive(Debug, Eq, Hash, PartialEq)]
				54	struct State {
				55	/// Whether this state is a match state or not.
				56	is_match: bool,
				57	/// An ordered sequence of NFA states that make up this DFA state.
				58	nfa_states: Vec<nfa::StateID>,
				59	}
				60
				61	impl<'a, S: StateID> Determinizer<'a, S> {
				62	/// Create a new determinizer for converting the given NFA to a DFA.
				63	pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
				64	let dead = Rc::new(State::dead());
				65	let mut cache = HashMap::default();
				66	cache.insert(dead.clone(), dead_id());
				67
				68	Determinizer {
				69	nfa,
				70	dfa: DFARepr::empty().anchored(nfa.is_anchored()),
				71	builder_states: vec![dead],
				72	cache,
				73	stack: vec![],
				74	scratch_nfa_states: vec![],
				75	longest_match: false,
				76	}
				77	}
				78
				79	/// Instruct the determinizer to use equivalence classes as the transition
				80	/// alphabet instead of all possible byte values.
				81	pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
				82	let byte_classes = self.nfa.byte_classes().clone();
				83	self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
				84	.anchored(self.nfa.is_anchored());
				85	self
				86	}
				87
				88	/// Instruct the determinizer to build a DFA that recognizes the longest
				89	/// possible match instead of the leftmost first match. This is useful when
				90	/// constructing reverse DFAs for finding the start of a match.
				91	pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
				92	self.longest_match = yes;
				93	self
				94	}
				95
				96	/// Build the DFA. If there was a problem constructing the DFA (e.g., if
				97	/// the chosen state identifier representation is too small), then an error
				98	/// is returned.
				99	pub fn build(mut self) -> Result<DFARepr<S>> {
				100	let representative_bytes: Vec<u8> =
				101	self.dfa.byte_classes().representatives().collect();
				102	let mut sparse = self.new_sparse_set();
				103	let mut uncompiled = vec![self.add_start(&mut sparse)?];
				104	while let Some(dfa_id) = uncompiled.pop() {
				105	for &b in &representative_bytes {
				106	let (next_dfa_id, is_new) =
				107	self.cached_state(dfa_id, b, &mut sparse)?;
				108	self.dfa.add_transition(dfa_id, b, next_dfa_id);
				109	if is_new {
				110	uncompiled.push(next_dfa_id);
				111	}
				112	}
				113	}
				114
				115	// At this point, we shuffle the matching states in the final DFA to
				116	// the beginning. This permits a DFA's match loop to detect a match
				117	// condition by merely inspecting the current state's identifier, and
				118	// avoids the need for any additional auxiliary storage.
				119	let is_match: Vec<bool> =
				120	self.builder_states.iter().map(\|s\| s.is_match).collect();
				121	self.dfa.shuffle_match_states(&is_match);
				122	Ok(self.dfa)
				123	}
				124
				125	/// Return the identifier for the next DFA state given an existing DFA
				126	/// state and an input byte. If the next DFA state already exists, then
				127	/// return its identifier from the cache. Otherwise, build the state, cache
				128	/// it and return its identifier.
				129	///
				130	/// The given sparse set is used for scratch space. It must have a capacity
				131	/// equivalent to the total number of NFA states, but its contents are
				132	/// otherwise unspecified.
				133	///
				134	/// This routine returns a boolean indicating whether a new state was
				135	/// built. If a new state is built, then the caller needs to add it to its
				136	/// frontier of uncompiled DFA states to compute transitions for.
				137	fn cached_state(
				138	&mut self,
				139	dfa_id: S,
				140	b: u8,
				141	sparse: &mut SparseSet,
				142	) -> Result<(S, bool)> {
				143	sparse.clear();
				144	// Compute the set of all reachable NFA states, including epsilons.
				145	self.next(dfa_id, b, sparse);
				146	// Build a candidate state and check if it has already been built.
				147	let state = self.new_state(sparse);
				148	if let Some(&cached_id) = self.cache.get(&state) {
				149	// Since we have a cached state, put the constructed state's
				150	// memory back into our scratch space, so that it can be reused.
Joel Galenson	8249a3d	2021-06-21 14:01:00 -0700	[diff] [blame]	151	let _ =
				152	mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
Jakub Kotur	3bceaeb	2020-12-21 17:28:16 +0100	[diff] [blame]	153	return Ok((cached_id, false));
				154	}
				155	// Nothing was in the cache, so add this state to the cache.
				156	self.add_state(state).map(\|s\| (s, true))
				157	}
				158
				159	/// Compute the set of all eachable NFA states, including the full epsilon
				160	/// closure, from a DFA state for a single byte of input.
				161	fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
				162	next_nfa_states.clear();
				163	for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
				164	let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
				165	match *self.nfa.state(nfa_id) {
				166	nfa::State::Union { .. }
				167	\| nfa::State::Fail
				168	\| nfa::State::Match => {}
				169	nfa::State::Range { range: ref r } => {
				170	if r.start <= b && b <= r.end {
				171	self.epsilon_closure(r.next, next_nfa_states);
				172	}
				173	}
				174	nfa::State::Sparse { ref ranges } => {
				175	for r in ranges.iter() {
				176	if r.start > b {
				177	break;
				178	} else if r.start <= b && b <= r.end {
				179	self.epsilon_closure(r.next, next_nfa_states);
				180	break;
				181	}
				182	}
				183	}
				184	}
				185	}
				186	}
				187
				188	/// Compute the epsilon closure for the given NFA state.
				189	fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
				190	if !self.nfa.state(start).is_epsilon() {
				191	set.insert(start);
				192	return;
				193	}
				194
				195	self.stack.push(start);
				196	while let Some(mut id) = self.stack.pop() {
				197	loop {
				198	if set.contains(id) {
				199	break;
				200	}
				201	set.insert(id);
				202	match *self.nfa.state(id) {
				203	nfa::State::Range { .. }
				204	\| nfa::State::Sparse { .. }
				205	\| nfa::State::Fail
				206	\| nfa::State::Match => break,
				207	nfa::State::Union { ref alternates } => {
				208	id = match alternates.get(0) {
				209	None => break,
				210	Some(&id) => id,
				211	};
				212	self.stack.extend(alternates[1..].iter().rev());
				213	}
				214	}
				215	}
				216	}
				217	}
				218
				219	/// Compute the initial DFA state and return its identifier.
				220	///
				221	/// The sparse set given is used for scratch space, and must have capacity
				222	/// equal to the total number of NFA states. Its contents are unspecified.
				223	fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
				224	sparse.clear();
				225	self.epsilon_closure(self.nfa.start(), sparse);
				226	let state = self.new_state(&sparse);
				227	let id = self.add_state(state)?;
				228	self.dfa.set_start_state(id);
				229	Ok(id)
				230	}
				231
				232	/// Add the given state to the DFA and make it available in the cache.
				233	///
				234	/// The state initially has no transitions. That is, it transitions to the
				235	/// dead state for all possible inputs.
				236	fn add_state(&mut self, state: State) -> Result<S> {
				237	let id = self.dfa.add_empty_state()?;
				238	let rstate = Rc::new(state);
				239	self.builder_states.push(rstate.clone());
				240	self.cache.insert(rstate, id);
				241	Ok(id)
				242	}
				243
				244	/// Convert the given set of ordered NFA states to a DFA state.
				245	fn new_state(&mut self, set: &SparseSet) -> State {
				246	let mut state = State {
				247	is_match: false,
				248	nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
				249	};
				250	state.nfa_states.clear();
				251
				252	for &id in set {
				253	match *self.nfa.state(id) {
				254	nfa::State::Range { .. } => {
				255	state.nfa_states.push(id);
				256	}
				257	nfa::State::Sparse { .. } => {
				258	state.nfa_states.push(id);
				259	}
				260	nfa::State::Fail => {
				261	break;
				262	}
				263	nfa::State::Match => {
				264	state.is_match = true;
				265	if !self.longest_match {
				266	break;
				267	}
				268	}
				269	nfa::State::Union { .. } => {}
				270	}
				271	}
				272	state
				273	}
				274
				275	/// Create a new sparse set with enough capacity to hold all NFA states.
				276	fn new_sparse_set(&self) -> SparseSet {
				277	SparseSet::new(self.nfa.len())
				278	}
				279	}
				280
				281	impl State {
				282	/// Create a new empty dead state.
				283	fn dead() -> State {
				284	State { nfa_states: vec![], is_match: false }
				285	}
				286	}