use std::cell::RefCell;
use std::fmt;
use std::mem;
use std::rc::Rc;

use dense;
use state_id::{dead_id, StateID};

type DFARepr<S> = dense::Repr<Vec<S>, S>;

/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
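///
/// The intended use is roughly as sketched below (the construction of the
/// DFA representation itself is elided and the binding name is
/// hypothetical): wrap a mutable dense DFA representation and consume the
/// minimizer with `run`.
///
/// ```ignore
/// // `repr` is a `dense::Repr<Vec<S>, S>`, e.g. as produced by
/// // determinization.
/// Minimizer::new(&mut repr).run();
/// // `repr` now contains one state per equivalence class of its original
/// // states.
/// ```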
pub(crate) struct Minimizer<'a, S: 'a> {
    /// The DFA to minimize, which is rewritten in place.
    dfa: &'a mut DFARepr<S>,
    /// For each state and each byte, the states that transition into it.
    in_transitions: Vec<Vec<Vec<S>>>,
    /// The current partitioning of states into candidate equivalence classes.
    partitions: Vec<StateSet<S>>,
    /// The work list of sets used to further refine `partitions`.
    waiting: Vec<StateSet<S>>,
}

impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Minimizer")
            .field("dfa", &self.dfa)
            .field("in_transitions", &self.in_transitions)
            .field("partitions", &self.partitions)
            .field("waiting", &self.waiting)
            .finish()
    }
}

/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
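///
/// As a small illustration (the state IDs `s1` and `s3` below are
/// hypothetical), a set is built by pushing IDs and then canonicalizing,
/// after which `min` yields the smallest ID, which is what the minimizer
/// uses as an equivalence class representative:
///
/// ```ignore
/// let mut set = StateSet::empty();
/// set.add(s3);
/// set.add(s1);
/// set.add(s1);
/// // Sort and dedup so that ordered-merge operations and `min` behave.
/// set.canonicalize();
/// assert_eq!(set.min(), s1);
/// ```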
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet<S>(Rc<RefCell<Vec<S>>>);

impl<'a, S: StateID> Minimizer<'a, S> {
    pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        // Seed the waiting set with the smallest initial partition.
        // (`initial_partitions` sorts them in ascending order of length.)
        let waiting = vec![partitions[0].clone()];

        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    pub fn run(mut self) {
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // Refine the partitions until no waiting sets remain. Each waiting
        // set acts as a "splitter": any partition containing both states
        // that transition into the splitter (on some byte) and states that
        // do not is split into two smaller partitions.
        while let Some(set) = self.waiting.pop() {
            for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
                self.find_incoming_to(b, &set, &mut incoming);

                for p in 0..self.partitions.len() {
                    // If this partition is entirely outside or entirely
                    // inside `incoming`, then it cannot be split by it.
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // Otherwise, split it into the part inside `incoming`
                    // (x) and the part outside of it (y).
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need
        // to use this partitioning to update the DFA to only contain one
        // state for each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[id.to_usize()] = p.min());
        }
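        // For example (hypothetical IDs): if states 2, 5 and 9 form one
        // equivalence class, then state_to_part[2], state_to_part[5] and
        // state_to_part[9] are all 2.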

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
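        //
        // Continuing the hypothetical example above: if the representatives,
        // in increasing ID order, are 0, 2 and 7, they are assigned the new
        // IDs 0, 1 and 2, so the minimal ID of state 5 is
        // minimal_ids[state_to_part[5]] == minimal_ids[2] == 1.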
        let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
        let mut new_id = S::from_usize(0);
        for (id, _) in self.dfa.states() {
            if state_to_part[id.to_usize()] == id {
                minimal_ids[id.to_usize()] = new_id;
                new_id = S::from_usize(new_id.to_usize() + 1);
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_id.to_usize();

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_count()).map(S::from_usize) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[id.to_usize()] != id {
                continue;
            }
            // Re-point each transition at the new minimal ID of its target's
            // representative.
            for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
                *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
            }
            // Move this representative into its final (minimal) position.
            self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
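        //
        // Continuing the hypothetical example above: with three equivalence
        // classes, minimal_count is 3, so only the three re-mapped
        // representative states survive the truncation below.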
        self.dfa.truncate_states(minimal_count);

        // Update the new start state, which is now just the minimal ID of
        // whatever state the old start state was collapsed into.
        let old_start = self.dfa.start_state();
        self.dfa.set_start_state(
            minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
        );

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
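        //
        // For example (hypothetical IDs): if the old match states are 1, 2
        // and 3, and states 2 and 3 collapse into the class represented by
        // state 2, then IDs 1, 2 and 3 are each mapped through state_to_part
        // and minimal_ids below, and the largest resulting ID wins.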
        let old_max = self.dfa.max_match_state();
        self.dfa.set_max_match_state(dead_id());
        for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
            let part = state_to_part[id.to_usize()];
            let new_id = minimal_ids[part.to_usize()];
            if new_id > self.dfa.max_match_state() {
                self.dfa.set_max_match_state(new_id);
            }
        }
    }

    /// Returns the index of the given set in the waiting list, if it is
    /// currently present.
    fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Computes the set of all states that have a transition on byte `b`
    /// into some state in `set`, storing the canonicalized result in
    /// `incoming`.
    fn find_incoming_to(
        &self,
        b: u8,
        set: &StateSet<S>,
        incoming: &mut StateSet<S>,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in &self.in_transitions[id.to_usize()][b as usize] {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }

    /// Creates the initial partition of states: match states and non-match
    /// states (the latter only if it is non-empty), sorted in ascending
    /// order of size.
    fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
        let mut is_match = StateSet::empty();
        let mut no_match = StateSet::empty();
        for (id, _) in dfa.states() {
            if dfa.is_match_state(id) {
                is_match.add(id);
            } else {
                no_match.add(id);
            }
        }

        let mut sets = vec![is_match];
        if !no_match.is_empty() {
            sets.push(no_match);
        }
        sets.sort_by_key(|s| s.len());
        sets
    }

    /// For each state and each byte in the alphabet, computes the list of
    /// states that have a transition into that state on that byte.
    fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for (id, state) in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[next.to_usize()][b as usize].push(id);
            }
        }
        incoming
    }
}

impl<S: StateID> StateSet<S> {
    fn empty() -> StateSet<S> {
        StateSet(Rc::new(RefCell::new(vec![])))
    }

    fn add(&mut self, id: S) {
        self.0.borrow_mut().push(id);
    }

    /// Returns the smallest ID in this set. This assumes the set is
    /// non-empty and has been canonicalized.
    fn min(&self) -> S {
        self.0.borrow()[0]
    }

    /// Sorts and dedups the IDs in this set so that ordered-merge operations
    /// (and `min`) behave correctly.
    fn canonicalize(&mut self) {
        self.0.borrow_mut().sort();
        self.0.borrow_mut().dedup();
    }

    fn clear(&mut self) {
        self.0.borrow_mut().clear();
    }

    fn len(&self) -> usize {
        self.0.borrow().len()
    }

    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Creates a new set with its own copy of the underlying IDs, rather
    /// than sharing them via `Rc`.
    fn deep_clone(&self) -> StateSet<S> {
        let ids = self.0.borrow().iter().cloned().collect();
        StateSet(Rc::new(RefCell::new(ids)))
    }

    fn iter<F: FnMut(S)>(&self, mut f: F) {
        for &id in self.0.borrow().iter() {
            f(id);
        }
    }

    /// Computes the intersection of `self` and `other` (both of which are
    /// assumed to be canonicalized) and stores the result in `dest`, via a
    /// linear merge of the two sorted ID lists.
    fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        let (seta, setb) = (self.0.borrow(), other.0.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Computes the set difference `self - other` (both of which are assumed
    /// to be canonicalized) and stores the result in `dest`, via a linear
    /// merge of the two sorted ID lists.
    fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            self.iter(|s| dest.add(s));
            return;
        }

        let (seta, setb) = (self.0.borrow(), other.0.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        for a in ita {
            dest.add(a);
        }
    }
}