src/determinize.rs - platform/external/rust/crates/regex-automata - Git at Google

 use std::collections::HashMap;
 use std::mem;
 use std::rc::Rc;

 use dense;
 use error::Result;
 use nfa::{self, NFA};
 use sparse_set::SparseSet;
 use state_id::{dead_id, StateID};

 type DFARepr<S> = dense::Repr<Vec<S>, S>;

 /// A determinizer converts an NFA to a DFA.
 ///
 /// This determinizer follows the typical powerset construction, where each
 /// DFA state is comprised of one or more NFA states. In the worst case, there
 /// is one DFA state for every possible combination of NFA states. In practice,
 /// this only happens in certain conditions, typically when there are bounded
 /// repetitions.
 ///
 /// The type variable `S` refers to the chosen state identifier representation
 /// used for the DFA.
 ///
 /// The lifetime variable `'a` refers to the lifetime of the NFA being
 /// converted to a DFA.
 #[derive(Debug)]
 pub(crate) struct Determinizer<'a, S: StateID> {
     /// The NFA we're converting into a DFA.
     nfa: &'a NFA,
     /// The DFA we're building.
     dfa: DFARepr<S>,
     /// Each DFA state being built is defined as an *ordered* set of NFA
     /// states, along with a flag indicating whether the state is a match
     /// state or not.
     ///
     /// This is never empty. The first state is always a dummy state such that
     /// a state id == 0 corresponds to a dead state.
     builder_states: Vec<Rc<State>>,
     /// A cache of DFA states that already exist and can be easily looked up
     /// via ordered sets of NFA states.
     cache: HashMap<Rc<State>, S>,
     /// Scratch space for a stack of NFA states to visit, for depth first
     /// visiting without recursion.
     stack: Vec<nfa::StateID>,
     /// Scratch space for storing an ordered sequence of NFA states, for
     /// amortizing allocation.
     scratch_nfa_states: Vec<nfa::StateID>,
     /// Whether to build a DFA that finds the longest possible match.
     longest_match: bool,
 }

 /// An intermediate representation for a DFA state during determinization.
 #[derive(Debug, Eq, Hash, PartialEq)]
 struct State {
     /// Whether this state is a match state or not.
     is_match: bool,
     /// An ordered sequence of NFA states that make up this DFA state.
     nfa_states: Vec<nfa::StateID>,
 }

 impl<'a, S: StateID> Determinizer<'a, S> {
     /// Create a new determinizer for converting the given NFA to a DFA.
     pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
         let dead = Rc::new(State::dead());
         let mut cache = HashMap::default();
         cache.insert(dead.clone(), dead_id());

         Determinizer {
             nfa,
             dfa: DFARepr::empty().anchored(nfa.is_anchored()),
             builder_states: vec![dead],
             cache,
             stack: vec![],
             scratch_nfa_states: vec![],
             longest_match: false,
         }
     }

     /// Instruct the determinizer to use equivalence classes as the transition
     /// alphabet instead of all possible byte values.
     pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
         let byte_classes = self.nfa.byte_classes().clone();
         self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
             .anchored(self.nfa.is_anchored());
         self
     }

     /// Instruct the determinizer to build a DFA that recognizes the longest
     /// possible match instead of the leftmost first match. This is useful when
     /// constructing reverse DFAs for finding the start of a match.
     pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
         self.longest_match = yes;
         self
     }

     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
     /// the chosen state identifier representation is too small), then an error
     /// is returned.
     pub fn build(mut self) -> Result<DFARepr<S>> {
         let representative_bytes: Vec<u8> =
             self.dfa.byte_classes().representatives().collect();
         let mut sparse = self.new_sparse_set();
         let mut uncompiled = vec![self.add_start(&mut sparse)?];
         while let Some(dfa_id) = uncompiled.pop() {
             for &b in &representative_bytes {
                 let (next_dfa_id, is_new) =
                     self.cached_state(dfa_id, b, &mut sparse)?;
                 self.dfa.add_transition(dfa_id, b, next_dfa_id);
                 if is_new {
                     uncompiled.push(next_dfa_id);
                 }
             }
         }

         // At this point, we shuffle the matching states in the final DFA to
         // the beginning. This permits a DFA's match loop to detect a match
         // condition by merely inspecting the current state's identifier, and
         // avoids the need for any additional auxiliary storage.
         let is_match: Vec<bool> =
             self.builder_states.iter().map(|s| s.is_match).collect();
         self.dfa.shuffle_match_states(&is_match);
         Ok(self.dfa)
     }

     /// Return the identifier for the next DFA state given an existing DFA
     /// state and an input byte. If the next DFA state already exists, then
     /// return its identifier from the cache. Otherwise, build the state, cache
     /// it and return its identifier.
     ///
     /// The given sparse set is used for scratch space. It must have a capacity
     /// equivalent to the total number of NFA states, but its contents are
     /// otherwise unspecified.
     ///
     /// This routine returns a boolean indicating whether a new state was
     /// built. If a new state is built, then the caller needs to add it to its
     /// frontier of uncompiled DFA states to compute transitions for.
     fn cached_state(
         &mut self,
         dfa_id: S,
         b: u8,
         sparse: &mut SparseSet,
     ) -> Result<(S, bool)> {
         sparse.clear();
         // Compute the set of all reachable NFA states, including epsilons.
         self.next(dfa_id, b, sparse);
         // Build a candidate state and check if it has already been built.
         let state = self.new_state(sparse);
         if let Some(&cached_id) = self.cache.get(&state) {
             // Since we have a cached state, put the constructed state's
             // memory back into our scratch space, so that it can be reused.
             mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
             return Ok((cached_id, false));
         }
         // Nothing was in the cache, so add this state to the cache.
         self.add_state(state).map(|s| (s, true))
     }

     /// Compute the set of all eachable NFA states, including the full epsilon
     /// closure, from a DFA state for a single byte of input.
     fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
         next_nfa_states.clear();
         for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
             let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
             match *self.nfa.state(nfa_id) {
                 nfa::State::Union { .. }
                 | nfa::State::Fail
                 | nfa::State::Match => {}
                 nfa::State::Range { range: ref r } => {
                     if r.start <= b && b <= r.end {
                         self.epsilon_closure(r.next, next_nfa_states);
                     }
                 }
                 nfa::State::Sparse { ref ranges } => {
                     for r in ranges.iter() {
                         if r.start > b {
                             break;
                         } else if r.start <= b && b <= r.end {
                             self.epsilon_closure(r.next, next_nfa_states);
                             break;
                         }
                     }
                 }
             }
         }
     }

     /// Compute the epsilon closure for the given NFA state.
     fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
         if !self.nfa.state(start).is_epsilon() {
             set.insert(start);
             return;
         }

         self.stack.push(start);
         while let Some(mut id) = self.stack.pop() {
             loop {
                 if set.contains(id) {
                     break;
                 }
                 set.insert(id);
                 match *self.nfa.state(id) {
                     nfa::State::Range { .. }
                     | nfa::State::Sparse { .. }
                     | nfa::State::Fail
                     | nfa::State::Match => break,
                     nfa::State::Union { ref alternates } => {
                         id = match alternates.get(0) {
                             None => break,
                             Some(&id) => id,
                         };
                         self.stack.extend(alternates[1..].iter().rev());
                     }
                 }
             }
         }
     }

     /// Compute the initial DFA state and return its identifier.
     ///
     /// The sparse set given is used for scratch space, and must have capacity
     /// equal to the total number of NFA states. Its contents are unspecified.
     fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
         sparse.clear();
         self.epsilon_closure(self.nfa.start(), sparse);
         let state = self.new_state(&sparse);
         let id = self.add_state(state)?;
         self.dfa.set_start_state(id);
         Ok(id)
     }

     /// Add the given state to the DFA and make it available in the cache.
     ///
     /// The state initially has no transitions. That is, it transitions to the
     /// dead state for all possible inputs.
     fn add_state(&mut self, state: State) -> Result<S> {
         let id = self.dfa.add_empty_state()?;
         let rstate = Rc::new(state);
         self.builder_states.push(rstate.clone());
         self.cache.insert(rstate, id);
         Ok(id)
     }

     /// Convert the given set of ordered NFA states to a DFA state.
     fn new_state(&mut self, set: &SparseSet) -> State {
         let mut state = State {
             is_match: false,
             nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
         };
         state.nfa_states.clear();

         for &id in set {
             match *self.nfa.state(id) {
                 nfa::State::Range { .. } => {
                     state.nfa_states.push(id);
                 }
                 nfa::State::Sparse { .. } => {
                     state.nfa_states.push(id);
                 }
                 nfa::State::Fail => {
                     break;
                 }
                 nfa::State::Match => {
                     state.is_match = true;
                     if !self.longest_match {
                         break;
                     }
                 }
                 nfa::State::Union { .. } => {}
             }
         }
         state
     }

     /// Create a new sparse set with enough capacity to hold all NFA states.
     fn new_sparse_set(&self) -> SparseSet {
         SparseSet::new(self.nfa.len())
     }
 }

 impl State {
     /// Create a new empty dead state.
     fn dead() -> State {
         State { nfa_states: vec![], is_match: false }
     }
 }
	use std::collections::HashMap;
	use std::mem;
	use std::rc::Rc;

	use dense;
	use error::Result;
	use nfa::{self, NFA};
	use sparse_set::SparseSet;
	use state_id::{dead_id, StateID};

	type DFARepr<S> = dense::Repr<Vec<S>, S>;

	/// A determinizer converts an NFA to a DFA.
	///
	/// This determinizer follows the typical powerset construction, where each
	/// DFA state is comprised of one or more NFA states. In the worst case, there
	/// is one DFA state for every possible combination of NFA states. In practice,
	/// this only happens in certain conditions, typically when there are bounded
	/// repetitions.
	///
	/// The type variable `S` refers to the chosen state identifier representation
	/// used for the DFA.
	///
	/// The lifetime variable `'a` refers to the lifetime of the NFA being
	/// converted to a DFA.
	#[derive(Debug)]
	pub(crate) struct Determinizer<'a, S: StateID> {
	/// The NFA we're converting into a DFA.
	nfa: &'a NFA,
	/// The DFA we're building.
	dfa: DFARepr<S>,
	/// Each DFA state being built is defined as an ordered set of NFA
	/// states, along with a flag indicating whether the state is a match
	/// state or not.
	///
	/// This is never empty. The first state is always a dummy state such that
	/// a state id == 0 corresponds to a dead state.
	builder_states: Vec<Rc<State>>,
	/// A cache of DFA states that already exist and can be easily looked up
	/// via ordered sets of NFA states.
	cache: HashMap<Rc<State>, S>,
	/// Scratch space for a stack of NFA states to visit, for depth first
	/// visiting without recursion.
	stack: Vec<nfa::StateID>,
	/// Scratch space for storing an ordered sequence of NFA states, for
	/// amortizing allocation.
	scratch_nfa_states: Vec<nfa::StateID>,
	/// Whether to build a DFA that finds the longest possible match.
	longest_match: bool,
	}

	/// An intermediate representation for a DFA state during determinization.
	#[derive(Debug, Eq, Hash, PartialEq)]
	struct State {
	/// Whether this state is a match state or not.
	is_match: bool,
	/// An ordered sequence of NFA states that make up this DFA state.
	nfa_states: Vec<nfa::StateID>,
	}

	impl<'a, S: StateID> Determinizer<'a, S> {
	/// Create a new determinizer for converting the given NFA to a DFA.
	pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
	let dead = Rc::new(State::dead());
	let mut cache = HashMap::default();
	cache.insert(dead.clone(), dead_id());

	Determinizer {
	nfa,
	dfa: DFARepr::empty().anchored(nfa.is_anchored()),
	builder_states: vec![dead],
	cache,
	stack: vec![],
	scratch_nfa_states: vec![],
	longest_match: false,
	}
	}

	/// Instruct the determinizer to use equivalence classes as the transition
	/// alphabet instead of all possible byte values.
	pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
	let byte_classes = self.nfa.byte_classes().clone();
	self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
	.anchored(self.nfa.is_anchored());
	self
	}

	/// Instruct the determinizer to build a DFA that recognizes the longest
	/// possible match instead of the leftmost first match. This is useful when
	/// constructing reverse DFAs for finding the start of a match.
	pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
	self.longest_match = yes;
	self
	}

	/// Build the DFA. If there was a problem constructing the DFA (e.g., if
	/// the chosen state identifier representation is too small), then an error
	/// is returned.
	pub fn build(mut self) -> Result<DFARepr<S>> {
	let representative_bytes: Vec<u8> =
	self.dfa.byte_classes().representatives().collect();
	let mut sparse = self.new_sparse_set();
	let mut uncompiled = vec![self.add_start(&mut sparse)?];
	while let Some(dfa_id) = uncompiled.pop() {
	for &b in &representative_bytes {
	let (next_dfa_id, is_new) =
	self.cached_state(dfa_id, b, &mut sparse)?;
	self.dfa.add_transition(dfa_id, b, next_dfa_id);
	if is_new {
	uncompiled.push(next_dfa_id);
	}
	}
	}

	// At this point, we shuffle the matching states in the final DFA to
	// the beginning. This permits a DFA's match loop to detect a match
	// condition by merely inspecting the current state's identifier, and
	// avoids the need for any additional auxiliary storage.
	let is_match: Vec<bool> =
	self.builder_states.iter().map(\|s\| s.is_match).collect();
	self.dfa.shuffle_match_states(&is_match);
	Ok(self.dfa)
	}

	/// Return the identifier for the next DFA state given an existing DFA
	/// state and an input byte. If the next DFA state already exists, then
	/// return its identifier from the cache. Otherwise, build the state, cache
	/// it and return its identifier.
	///
	/// The given sparse set is used for scratch space. It must have a capacity
	/// equivalent to the total number of NFA states, but its contents are
	/// otherwise unspecified.
	///
	/// This routine returns a boolean indicating whether a new state was
	/// built. If a new state is built, then the caller needs to add it to its
	/// frontier of uncompiled DFA states to compute transitions for.
	fn cached_state(
	&mut self,
	dfa_id: S,
	b: u8,
	sparse: &mut SparseSet,
	) -> Result<(S, bool)> {
	sparse.clear();
	// Compute the set of all reachable NFA states, including epsilons.
	self.next(dfa_id, b, sparse);
	// Build a candidate state and check if it has already been built.
	let state = self.new_state(sparse);
	if let Some(&cached_id) = self.cache.get(&state) {
	// Since we have a cached state, put the constructed state's
	// memory back into our scratch space, so that it can be reused.
	mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
	return Ok((cached_id, false));
	}
	// Nothing was in the cache, so add this state to the cache.
	self.add_state(state).map(\|s\| (s, true))
	}

	/// Compute the set of all eachable NFA states, including the full epsilon
	/// closure, from a DFA state for a single byte of input.
	fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
	next_nfa_states.clear();
	for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
	let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
	match *self.nfa.state(nfa_id) {
	nfa::State::Union { .. }
	\| nfa::State::Fail
	\| nfa::State::Match => {}
	nfa::State::Range { range: ref r } => {
	if r.start <= b && b <= r.end {
	self.epsilon_closure(r.next, next_nfa_states);
	}
	}
	nfa::State::Sparse { ref ranges } => {
	for r in ranges.iter() {
	if r.start > b {
	break;
	} else if r.start <= b && b <= r.end {
	self.epsilon_closure(r.next, next_nfa_states);
	break;
	}
	}
	}
	}
	}
	}

	/// Compute the epsilon closure for the given NFA state.
	fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
	if !self.nfa.state(start).is_epsilon() {
	set.insert(start);
	return;
	}

	self.stack.push(start);
	while let Some(mut id) = self.stack.pop() {
	loop {
	if set.contains(id) {
	break;
	}
	set.insert(id);
	match *self.nfa.state(id) {
	nfa::State::Range { .. }
	\| nfa::State::Sparse { .. }
	\| nfa::State::Fail
	\| nfa::State::Match => break,
	nfa::State::Union { ref alternates } => {
	id = match alternates.get(0) {
	None => break,
	Some(&id) => id,
	};
	self.stack.extend(alternates[1..].iter().rev());
	}
	}
	}
	}
	}

	/// Compute the initial DFA state and return its identifier.
	///
	/// The sparse set given is used for scratch space, and must have capacity
	/// equal to the total number of NFA states. Its contents are unspecified.
	fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
	sparse.clear();
	self.epsilon_closure(self.nfa.start(), sparse);
	let state = self.new_state(&sparse);
	let id = self.add_state(state)?;
	self.dfa.set_start_state(id);
	Ok(id)
	}

	/// Add the given state to the DFA and make it available in the cache.
	///
	/// The state initially has no transitions. That is, it transitions to the
	/// dead state for all possible inputs.
	fn add_state(&mut self, state: State) -> Result<S> {
	let id = self.dfa.add_empty_state()?;
	let rstate = Rc::new(state);
	self.builder_states.push(rstate.clone());
	self.cache.insert(rstate, id);
	Ok(id)
	}

	/// Convert the given set of ordered NFA states to a DFA state.
	fn new_state(&mut self, set: &SparseSet) -> State {
	let mut state = State {
	is_match: false,
	nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
	};
	state.nfa_states.clear();

	for &id in set {
	match *self.nfa.state(id) {
	nfa::State::Range { .. } => {
	state.nfa_states.push(id);
	}
	nfa::State::Sparse { .. } => {
	state.nfa_states.push(id);
	}
	nfa::State::Fail => {
	break;
	}
	nfa::State::Match => {
	state.is_match = true;
	if !self.longest_match {
	break;
	}
	}
	nfa::State::Union { .. } => {}
	}
	}
	state
	}

	/// Create a new sparse set with enough capacity to hold all NFA states.
	fn new_sparse_set(&self) -> SparseSet {
	SparseSet::new(self.nfa.len())
	}
	}

	impl State {
	/// Create a new empty dead state.
	fn dead() -> State {
	State { nfa_states: vec![], is_match: false }
	}
	}