Jakub Kotur | 3bceaeb | 2020-12-21 17:28:16 +0100 | [diff] [blame] | 1 | use std::collections::HashMap; |
| 2 | use std::mem; |
| 3 | use std::rc::Rc; |
| 4 | |
| 5 | use dense; |
| 6 | use error::Result; |
| 7 | use nfa::{self, NFA}; |
| 8 | use sparse_set::SparseSet; |
| 9 | use state_id::{dead_id, StateID}; |
| 10 | |
| 11 | type DFARepr<S> = dense::Repr<Vec<S>, S>; |
| 12 | |
| 13 | /// A determinizer converts an NFA to a DFA. |
| 14 | /// |
| 15 | /// This determinizer follows the typical powerset construction, where each |
| 16 | /// DFA state is comprised of one or more NFA states. In the worst case, there |
| 17 | /// is one DFA state for every possible combination of NFA states. In practice, |
| 18 | /// this only happens in certain conditions, typically when there are bounded |
| 19 | /// repetitions. |
| 20 | /// |
| 21 | /// The type variable `S` refers to the chosen state identifier representation |
| 22 | /// used for the DFA. |
| 23 | /// |
| 24 | /// The lifetime variable `'a` refers to the lifetime of the NFA being |
| 25 | /// converted to a DFA. |
| 26 | #[derive(Debug)] |
| 27 | pub(crate) struct Determinizer<'a, S: StateID> { |
| 28 | /// The NFA we're converting into a DFA. |
| 29 | nfa: &'a NFA, |
| 30 | /// The DFA we're building. |
| 31 | dfa: DFARepr<S>, |
| 32 | /// Each DFA state being built is defined as an *ordered* set of NFA |
| 33 | /// states, along with a flag indicating whether the state is a match |
| 34 | /// state or not. |
| 35 | /// |
| 36 | /// This is never empty. The first state is always a dummy state such that |
| 37 | /// a state id == 0 corresponds to a dead state. |
| 38 | builder_states: Vec<Rc<State>>, |
| 39 | /// A cache of DFA states that already exist and can be easily looked up |
| 40 | /// via ordered sets of NFA states. |
| 41 | cache: HashMap<Rc<State>, S>, |
| 42 | /// Scratch space for a stack of NFA states to visit, for depth first |
| 43 | /// visiting without recursion. |
| 44 | stack: Vec<nfa::StateID>, |
| 45 | /// Scratch space for storing an ordered sequence of NFA states, for |
| 46 | /// amortizing allocation. |
| 47 | scratch_nfa_states: Vec<nfa::StateID>, |
| 48 | /// Whether to build a DFA that finds the longest possible match. |
| 49 | longest_match: bool, |
| 50 | } |
| 51 | |
| 52 | /// An intermediate representation for a DFA state during determinization. |
| 53 | #[derive(Debug, Eq, Hash, PartialEq)] |
| 54 | struct State { |
| 55 | /// Whether this state is a match state or not. |
| 56 | is_match: bool, |
| 57 | /// An ordered sequence of NFA states that make up this DFA state. |
| 58 | nfa_states: Vec<nfa::StateID>, |
| 59 | } |
| 60 | |
| 61 | impl<'a, S: StateID> Determinizer<'a, S> { |
| 62 | /// Create a new determinizer for converting the given NFA to a DFA. |
| 63 | pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { |
| 64 | let dead = Rc::new(State::dead()); |
| 65 | let mut cache = HashMap::default(); |
| 66 | cache.insert(dead.clone(), dead_id()); |
| 67 | |
| 68 | Determinizer { |
| 69 | nfa, |
| 70 | dfa: DFARepr::empty().anchored(nfa.is_anchored()), |
| 71 | builder_states: vec![dead], |
| 72 | cache, |
| 73 | stack: vec![], |
| 74 | scratch_nfa_states: vec![], |
| 75 | longest_match: false, |
| 76 | } |
| 77 | } |
| 78 | |
| 79 | /// Instruct the determinizer to use equivalence classes as the transition |
| 80 | /// alphabet instead of all possible byte values. |
| 81 | pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { |
| 82 | let byte_classes = self.nfa.byte_classes().clone(); |
| 83 | self.dfa = DFARepr::empty_with_byte_classes(byte_classes) |
| 84 | .anchored(self.nfa.is_anchored()); |
| 85 | self |
| 86 | } |
| 87 | |
| 88 | /// Instruct the determinizer to build a DFA that recognizes the longest |
| 89 | /// possible match instead of the leftmost first match. This is useful when |
| 90 | /// constructing reverse DFAs for finding the start of a match. |
| 91 | pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { |
| 92 | self.longest_match = yes; |
| 93 | self |
| 94 | } |
| 95 | |
| 96 | /// Build the DFA. If there was a problem constructing the DFA (e.g., if |
| 97 | /// the chosen state identifier representation is too small), then an error |
| 98 | /// is returned. |
| 99 | pub fn build(mut self) -> Result<DFARepr<S>> { |
| 100 | let representative_bytes: Vec<u8> = |
| 101 | self.dfa.byte_classes().representatives().collect(); |
| 102 | let mut sparse = self.new_sparse_set(); |
| 103 | let mut uncompiled = vec![self.add_start(&mut sparse)?]; |
| 104 | while let Some(dfa_id) = uncompiled.pop() { |
| 105 | for &b in &representative_bytes { |
| 106 | let (next_dfa_id, is_new) = |
| 107 | self.cached_state(dfa_id, b, &mut sparse)?; |
| 108 | self.dfa.add_transition(dfa_id, b, next_dfa_id); |
| 109 | if is_new { |
| 110 | uncompiled.push(next_dfa_id); |
| 111 | } |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | // At this point, we shuffle the matching states in the final DFA to |
| 116 | // the beginning. This permits a DFA's match loop to detect a match |
| 117 | // condition by merely inspecting the current state's identifier, and |
| 118 | // avoids the need for any additional auxiliary storage. |
| 119 | let is_match: Vec<bool> = |
| 120 | self.builder_states.iter().map(|s| s.is_match).collect(); |
| 121 | self.dfa.shuffle_match_states(&is_match); |
| 122 | Ok(self.dfa) |
| 123 | } |
| 124 | |
| 125 | /// Return the identifier for the next DFA state given an existing DFA |
| 126 | /// state and an input byte. If the next DFA state already exists, then |
| 127 | /// return its identifier from the cache. Otherwise, build the state, cache |
| 128 | /// it and return its identifier. |
| 129 | /// |
| 130 | /// The given sparse set is used for scratch space. It must have a capacity |
| 131 | /// equivalent to the total number of NFA states, but its contents are |
| 132 | /// otherwise unspecified. |
| 133 | /// |
| 134 | /// This routine returns a boolean indicating whether a new state was |
| 135 | /// built. If a new state is built, then the caller needs to add it to its |
| 136 | /// frontier of uncompiled DFA states to compute transitions for. |
| 137 | fn cached_state( |
| 138 | &mut self, |
| 139 | dfa_id: S, |
| 140 | b: u8, |
| 141 | sparse: &mut SparseSet, |
| 142 | ) -> Result<(S, bool)> { |
| 143 | sparse.clear(); |
| 144 | // Compute the set of all reachable NFA states, including epsilons. |
| 145 | self.next(dfa_id, b, sparse); |
| 146 | // Build a candidate state and check if it has already been built. |
| 147 | let state = self.new_state(sparse); |
| 148 | if let Some(&cached_id) = self.cache.get(&state) { |
| 149 | // Since we have a cached state, put the constructed state's |
| 150 | // memory back into our scratch space, so that it can be reused. |
Joel Galenson | 8249a3d | 2021-06-21 14:01:00 -0700 | [diff] [blame] | 151 | let _ = |
| 152 | mem::replace(&mut self.scratch_nfa_states, state.nfa_states); |
Jakub Kotur | 3bceaeb | 2020-12-21 17:28:16 +0100 | [diff] [blame] | 153 | return Ok((cached_id, false)); |
| 154 | } |
| 155 | // Nothing was in the cache, so add this state to the cache. |
| 156 | self.add_state(state).map(|s| (s, true)) |
| 157 | } |
| 158 | |
| 159 | /// Compute the set of all eachable NFA states, including the full epsilon |
| 160 | /// closure, from a DFA state for a single byte of input. |
| 161 | fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { |
| 162 | next_nfa_states.clear(); |
| 163 | for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { |
| 164 | let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; |
| 165 | match *self.nfa.state(nfa_id) { |
| 166 | nfa::State::Union { .. } |
| 167 | | nfa::State::Fail |
| 168 | | nfa::State::Match => {} |
| 169 | nfa::State::Range { range: ref r } => { |
| 170 | if r.start <= b && b <= r.end { |
| 171 | self.epsilon_closure(r.next, next_nfa_states); |
| 172 | } |
| 173 | } |
| 174 | nfa::State::Sparse { ref ranges } => { |
| 175 | for r in ranges.iter() { |
| 176 | if r.start > b { |
| 177 | break; |
| 178 | } else if r.start <= b && b <= r.end { |
| 179 | self.epsilon_closure(r.next, next_nfa_states); |
| 180 | break; |
| 181 | } |
| 182 | } |
| 183 | } |
| 184 | } |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | /// Compute the epsilon closure for the given NFA state. |
| 189 | fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { |
| 190 | if !self.nfa.state(start).is_epsilon() { |
| 191 | set.insert(start); |
| 192 | return; |
| 193 | } |
| 194 | |
| 195 | self.stack.push(start); |
| 196 | while let Some(mut id) = self.stack.pop() { |
| 197 | loop { |
| 198 | if set.contains(id) { |
| 199 | break; |
| 200 | } |
| 201 | set.insert(id); |
| 202 | match *self.nfa.state(id) { |
| 203 | nfa::State::Range { .. } |
| 204 | | nfa::State::Sparse { .. } |
| 205 | | nfa::State::Fail |
| 206 | | nfa::State::Match => break, |
| 207 | nfa::State::Union { ref alternates } => { |
| 208 | id = match alternates.get(0) { |
| 209 | None => break, |
| 210 | Some(&id) => id, |
| 211 | }; |
| 212 | self.stack.extend(alternates[1..].iter().rev()); |
| 213 | } |
| 214 | } |
| 215 | } |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | /// Compute the initial DFA state and return its identifier. |
| 220 | /// |
| 221 | /// The sparse set given is used for scratch space, and must have capacity |
| 222 | /// equal to the total number of NFA states. Its contents are unspecified. |
| 223 | fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { |
| 224 | sparse.clear(); |
| 225 | self.epsilon_closure(self.nfa.start(), sparse); |
| 226 | let state = self.new_state(&sparse); |
| 227 | let id = self.add_state(state)?; |
| 228 | self.dfa.set_start_state(id); |
| 229 | Ok(id) |
| 230 | } |
| 231 | |
| 232 | /// Add the given state to the DFA and make it available in the cache. |
| 233 | /// |
| 234 | /// The state initially has no transitions. That is, it transitions to the |
| 235 | /// dead state for all possible inputs. |
| 236 | fn add_state(&mut self, state: State) -> Result<S> { |
| 237 | let id = self.dfa.add_empty_state()?; |
| 238 | let rstate = Rc::new(state); |
| 239 | self.builder_states.push(rstate.clone()); |
| 240 | self.cache.insert(rstate, id); |
| 241 | Ok(id) |
| 242 | } |
| 243 | |
| 244 | /// Convert the given set of ordered NFA states to a DFA state. |
| 245 | fn new_state(&mut self, set: &SparseSet) -> State { |
| 246 | let mut state = State { |
| 247 | is_match: false, |
| 248 | nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), |
| 249 | }; |
| 250 | state.nfa_states.clear(); |
| 251 | |
| 252 | for &id in set { |
| 253 | match *self.nfa.state(id) { |
| 254 | nfa::State::Range { .. } => { |
| 255 | state.nfa_states.push(id); |
| 256 | } |
| 257 | nfa::State::Sparse { .. } => { |
| 258 | state.nfa_states.push(id); |
| 259 | } |
| 260 | nfa::State::Fail => { |
| 261 | break; |
| 262 | } |
| 263 | nfa::State::Match => { |
| 264 | state.is_match = true; |
| 265 | if !self.longest_match { |
| 266 | break; |
| 267 | } |
| 268 | } |
| 269 | nfa::State::Union { .. } => {} |
| 270 | } |
| 271 | } |
| 272 | state |
| 273 | } |
| 274 | |
| 275 | /// Create a new sparse set with enough capacity to hold all NFA states. |
| 276 | fn new_sparse_set(&self) -> SparseSet { |
| 277 | SparseSet::new(self.nfa.len()) |
| 278 | } |
| 279 | } |
| 280 | |
| 281 | impl State { |
| 282 | /// Create a new empty dead state. |
| 283 | fn dead() -> State { |
| 284 | State { nfa_states: vec![], is_match: false } |
| 285 | } |
| 286 | } |