Jakub Kotur | 3bceaeb | 2020-12-21 17:28:16 +0100 | [diff] [blame] | 1 | use state_id::StateID; |
| 2 | |
| 3 | /// A trait describing the interface of a deterministic finite automaton (DFA). |
| 4 | /// |
| 5 | /// Every DFA has exactly one start state and at least one dead state (which |
| 6 | /// may be the same, as in the case of an empty DFA). In all cases, a state |
| 7 | /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` |
| 8 | /// always returns `true`. |
| 9 | /// |
| 10 | /// Every DFA also has zero or more match states, such that |
| 11 | /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to |
| 12 | /// a match state. |
| 13 | /// |
| 14 | /// In general, users of this trait likely will only need to use the search |
| 15 | /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other |
| 16 | /// methods are lower level and are used for walking the transitions of a DFA |
| 17 | /// manually. In particular, the aforementioned search routines are implemented |
| 18 | /// generically in terms of the lower level transition walking routines. |
| 19 | pub trait DFA { |
| 20 | /// The representation used for state identifiers in this DFA. |
| 21 | /// |
| 22 | /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. |
| 23 | type ID: StateID; |
| 24 | |
| 25 | /// Return the identifier of this DFA's start state. |
| 26 | fn start_state(&self) -> Self::ID; |
| 27 | |
| 28 | /// Returns true if and only if the given identifier corresponds to a match |
| 29 | /// state. |
| 30 | fn is_match_state(&self, id: Self::ID) -> bool; |
| 31 | |
| 32 | /// Returns true if and only if the given identifier corresponds to a dead |
| 33 | /// state. When a DFA enters a dead state, it is impossible to leave and |
| 34 | /// thus can never lead to a match. |
| 35 | fn is_dead_state(&self, id: Self::ID) -> bool; |
| 36 | |
| 37 | /// Returns true if and only if the given identifier corresponds to either |
| 38 | /// a dead state or a match state, such that one of `is_match_state(id)` |
| 39 | /// or `is_dead_state(id)` must return true. |
| 40 | /// |
| 41 | /// Depending on the implementation of the DFA, this routine can be used |
| 42 | /// to save a branch in the core matching loop. Nevertheless, |
| 43 | /// `is_match_state(id) || is_dead_state(id)` is always a valid |
| 44 | /// implementation. |
| 45 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool; |
| 46 | |
| 47 | /// Returns true if and only if this DFA is anchored. |
| 48 | /// |
| 49 | /// When a DFA is anchored, it is only allowed to report matches that |
| 50 | /// start at index `0`. |
| 51 | fn is_anchored(&self) -> bool; |
| 52 | |
| 53 | /// Given the current state that this DFA is in and the next input byte, |
| 54 | /// this method returns the identifier of the next state. The identifier |
| 55 | /// returned is always valid, but it may correspond to a dead state. |
| 56 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; |
| 57 | |
| 58 | /// Like `next_state`, but its implementation may look up the next state |
| 59 | /// without memory safety checks such as bounds checks. As such, callers |
| 60 | /// must ensure that the given identifier corresponds to a valid DFA |
| 61 | /// state. Implementors must, in turn, ensure that this routine is safe |
| 62 | /// for all valid state identifiers and for all possible `u8` values. |
| 63 | unsafe fn next_state_unchecked( |
| 64 | &self, |
| 65 | current: Self::ID, |
| 66 | input: u8, |
| 67 | ) -> Self::ID; |
| 68 | |
| 69 | /// Returns true if and only if the given bytes match this DFA. |
| 70 | /// |
| 71 | /// This routine may short circuit if it knows that scanning future input |
| 72 | /// will never lead to a different result. In particular, if a DFA enters |
| 73 | /// a match state or a dead state, then this routine will return `true` or |
| 74 | /// `false`, respectively, without inspecting any future input. |
| 75 | /// |
| 76 | /// # Example |
| 77 | /// |
| 78 | /// This example shows how to use this method with a |
| 79 | /// [`DenseDFA`](enum.DenseDFA.html). |
| 80 | /// |
| 81 | /// ``` |
| 82 | /// use regex_automata::{DFA, DenseDFA}; |
| 83 | /// |
| 84 | /// # fn example() -> Result<(), regex_automata::Error> { |
| 85 | /// let dfa = DenseDFA::new("foo[0-9]+bar")?; |
| 86 | /// assert_eq!(true, dfa.is_match(b"foo12345bar")); |
| 87 | /// assert_eq!(false, dfa.is_match(b"foobar")); |
| 88 | /// # Ok(()) }; example().unwrap() |
| 89 | /// ``` |
| 90 | #[inline] |
| 91 | fn is_match(&self, bytes: &[u8]) -> bool { |
| 92 | self.is_match_at(bytes, 0) |
| 93 | } |
| 94 | |
| 95 | /// Returns the first position at which a match is found. |
| 96 | /// |
| 97 | /// This routine stops scanning input in precisely the same circumstances |
| 98 | /// as `is_match`. The key difference is that this routine returns the |
| 99 | /// position at which it stopped scanning input if and only if a match |
| 100 | /// was found. If no match is found, then `None` is returned. |
| 101 | /// |
| 102 | /// # Example |
| 103 | /// |
| 104 | /// This example shows how to use this method with a |
| 105 | /// [`DenseDFA`](enum.DenseDFA.html). |
| 106 | /// |
| 107 | /// ``` |
| 108 | /// use regex_automata::{DFA, DenseDFA}; |
| 109 | /// |
| 110 | /// # fn example() -> Result<(), regex_automata::Error> { |
| 111 | /// let dfa = DenseDFA::new("foo[0-9]+")?; |
| 112 | /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); |
| 113 | /// |
| 114 | /// // Normally, the end of the leftmost first match here would be 3, |
| 115 | /// // but the shortest match semantics detect a match earlier. |
| 116 | /// let dfa = DenseDFA::new("abc|a")?; |
| 117 | /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); |
| 118 | /// # Ok(()) }; example().unwrap() |
| 119 | /// ``` |
| 120 | #[inline] |
| 121 | fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { |
| 122 | self.shortest_match_at(bytes, 0) |
| 123 | } |
| 124 | |
| 125 | /// Returns the end offset of the longest match. If no match exists, |
| 126 | /// then `None` is returned. |
| 127 | /// |
| 128 | /// Implementors of this trait are not required to implement any particular |
| 129 | /// match semantics (such as leftmost-first), which are instead manifest in |
| 130 | /// the DFA's topology itself. |
| 131 | /// |
| 132 | /// In particular, this method must continue searching even after it |
| 133 | /// enters a match state. The search should only terminate once it has |
| 134 | /// reached the end of the input or when it has entered a dead state. Upon |
| 135 | /// termination, the position of the last byte seen while still in a match |
| 136 | /// state is returned. |
| 137 | /// |
| 138 | /// # Example |
| 139 | /// |
| 140 | /// This example shows how to use this method with a |
| 141 | /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses |
| 142 | /// "leftmost first" match semantics. |
| 143 | /// |
| 144 | /// Leftmost first match semantics corresponds to the match with the |
| 145 | /// smallest starting offset, but where the end offset is determined by |
| 146 | /// preferring earlier branches in the original regular expression. For |
| 147 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
| 148 | /// will match `Samwise` in `Samwise`. |
| 149 | /// |
| 150 | /// Generally speaking, the "leftmost first" match is how most backtracking |
| 151 | /// regular expressions tend to work. This is in contrast to POSIX-style |
| 152 | /// regular expressions that yield "leftmost longest" matches. Namely, |
| 153 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
| 154 | /// leftmost longest semantics. |
| 155 | /// |
| 156 | /// ``` |
| 157 | /// use regex_automata::{DFA, DenseDFA}; |
| 158 | /// |
| 159 | /// # fn example() -> Result<(), regex_automata::Error> { |
| 160 | /// let dfa = DenseDFA::new("foo[0-9]+")?; |
| 161 | /// assert_eq!(Some(8), dfa.find(b"foo12345")); |
| 162 | /// |
| 163 | /// // Even though a match is found after reading the first byte (`a`), |
| 164 | /// // the leftmost first match semantics demand that we find the earliest |
| 165 | /// // match that prefers earlier parts of the pattern over latter parts. |
| 166 | /// let dfa = DenseDFA::new("abc|a")?; |
| 167 | /// assert_eq!(Some(3), dfa.find(b"abc")); |
| 168 | /// # Ok(()) }; example().unwrap() |
| 169 | /// ``` |
| 170 | #[inline] |
| 171 | fn find(&self, bytes: &[u8]) -> Option<usize> { |
| 172 | self.find_at(bytes, 0) |
| 173 | } |
| 174 | |
| 175 | /// Returns the start offset of the longest match in reverse, by searching |
| 176 | /// from the end of the input towards the start of the input. If no match |
| 177 | /// exists, then `None` is returned. In other words, this has the same |
| 178 | /// match semantics as `find`, but in reverse. |
| 179 | /// |
| 180 | /// # Example |
| 181 | /// |
| 182 | /// This example shows how to use this method with a |
| 183 | /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine |
| 184 | /// is principally useful when used in conjunction with the |
| 185 | /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) |
| 186 | /// configuration knob. In general, it's unlikely to be correct to use both |
| 187 | /// `find` and `rfind` with the same DFA since any particular DFA will only |
| 188 | /// support searching in one direction. |
| 189 | /// |
| 190 | /// ``` |
| 191 | /// use regex_automata::{dense, DFA}; |
| 192 | /// |
| 193 | /// # fn example() -> Result<(), regex_automata::Error> { |
| 194 | /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; |
| 195 | /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); |
| 196 | /// # Ok(()) }; example().unwrap() |
| 197 | /// ``` |
| 198 | #[inline] |
| 199 | fn rfind(&self, bytes: &[u8]) -> Option<usize> { |
| 200 | self.rfind_at(bytes, bytes.len()) |
| 201 | } |
| 202 | |
| 203 | /// Returns the same as `is_match`, but starts the search at the given |
| 204 | /// offset. |
| 205 | /// |
| 206 | /// The significance of the starting point is that it takes the surrounding |
| 207 | /// context into consideration. For example, if the DFA is anchored, then |
| 208 | /// a match can only occur when `start == 0`. |
| 209 | #[inline] |
| 210 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |
| 211 | if self.is_anchored() && start > 0 { |
| 212 | return false; |
| 213 | } |
| 214 | |
| 215 | let mut state = self.start_state(); |
| 216 | if self.is_match_or_dead_state(state) { |
| 217 | return self.is_match_state(state); |
| 218 | } |
| 219 | for &b in bytes[start..].iter() { |
| 220 | state = unsafe { self.next_state_unchecked(state, b) }; |
| 221 | if self.is_match_or_dead_state(state) { |
| 222 | return self.is_match_state(state); |
| 223 | } |
| 224 | } |
| 225 | false |
| 226 | } |
| 227 | |
| 228 | /// Returns the same as `shortest_match`, but starts the search at the |
| 229 | /// given offset. |
| 230 | /// |
| 231 | /// The significance of the starting point is that it takes the surrounding |
| 232 | /// context into consideration. For example, if the DFA is anchored, then |
| 233 | /// a match can only occur when `start == 0`. |
| 234 | #[inline] |
| 235 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
| 236 | if self.is_anchored() && start > 0 { |
| 237 | return None; |
| 238 | } |
| 239 | |
| 240 | let mut state = self.start_state(); |
| 241 | if self.is_match_or_dead_state(state) { |
| 242 | return if self.is_dead_state(state) { None } else { Some(start) }; |
| 243 | } |
| 244 | for (i, &b) in bytes[start..].iter().enumerate() { |
| 245 | state = unsafe { self.next_state_unchecked(state, b) }; |
| 246 | if self.is_match_or_dead_state(state) { |
| 247 | return if self.is_dead_state(state) { |
| 248 | None |
| 249 | } else { |
| 250 | Some(start + i + 1) |
| 251 | }; |
| 252 | } |
| 253 | } |
| 254 | None |
| 255 | } |
| 256 | |
| 257 | /// Returns the same as `find`, but starts the search at the given |
| 258 | /// offset. |
| 259 | /// |
| 260 | /// The significance of the starting point is that it takes the surrounding |
| 261 | /// context into consideration. For example, if the DFA is anchored, then |
| 262 | /// a match can only occur when `start == 0`. |
| 263 | #[inline] |
| 264 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
| 265 | if self.is_anchored() && start > 0 { |
| 266 | return None; |
| 267 | } |
| 268 | |
| 269 | let mut state = self.start_state(); |
| 270 | let mut last_match = if self.is_dead_state(state) { |
| 271 | return None; |
| 272 | } else if self.is_match_state(state) { |
| 273 | Some(start) |
| 274 | } else { |
| 275 | None |
| 276 | }; |
| 277 | for (i, &b) in bytes[start..].iter().enumerate() { |
| 278 | state = unsafe { self.next_state_unchecked(state, b) }; |
| 279 | if self.is_match_or_dead_state(state) { |
| 280 | if self.is_dead_state(state) { |
| 281 | return last_match; |
| 282 | } |
| 283 | last_match = Some(start + i + 1); |
| 284 | } |
| 285 | } |
| 286 | last_match |
| 287 | } |
| 288 | |
| 289 | /// Returns the same as `rfind`, but starts the search at the given |
| 290 | /// offset. |
| 291 | /// |
| 292 | /// The significance of the starting point is that it takes the surrounding |
| 293 | /// context into consideration. For example, if the DFA is anchored, then |
| 294 | /// a match can only occur when `start == bytes.len()`. |
| 295 | #[inline(never)] |
| 296 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
| 297 | if self.is_anchored() && start < bytes.len() { |
| 298 | return None; |
| 299 | } |
| 300 | |
| 301 | let mut state = self.start_state(); |
| 302 | let mut last_match = if self.is_dead_state(state) { |
| 303 | return None; |
| 304 | } else if self.is_match_state(state) { |
| 305 | Some(start) |
| 306 | } else { |
| 307 | None |
| 308 | }; |
| 309 | for (i, &b) in bytes[..start].iter().enumerate().rev() { |
| 310 | state = unsafe { self.next_state_unchecked(state, b) }; |
| 311 | if self.is_match_or_dead_state(state) { |
| 312 | if self.is_dead_state(state) { |
| 313 | return last_match; |
| 314 | } |
| 315 | last_match = Some(i); |
| 316 | } |
| 317 | } |
| 318 | last_match |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | impl<'a, T: DFA> DFA for &'a T { |
| 323 | type ID = T::ID; |
| 324 | |
| 325 | #[inline] |
| 326 | fn start_state(&self) -> Self::ID { |
| 327 | (**self).start_state() |
| 328 | } |
| 329 | |
| 330 | #[inline] |
| 331 | fn is_match_state(&self, id: Self::ID) -> bool { |
| 332 | (**self).is_match_state(id) |
| 333 | } |
| 334 | |
| 335 | #[inline] |
| 336 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool { |
| 337 | (**self).is_match_or_dead_state(id) |
| 338 | } |
| 339 | |
| 340 | #[inline] |
| 341 | fn is_dead_state(&self, id: Self::ID) -> bool { |
| 342 | (**self).is_dead_state(id) |
| 343 | } |
| 344 | |
| 345 | #[inline] |
| 346 | fn is_anchored(&self) -> bool { |
| 347 | (**self).is_anchored() |
| 348 | } |
| 349 | |
| 350 | #[inline] |
| 351 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { |
| 352 | (**self).next_state(current, input) |
| 353 | } |
| 354 | |
| 355 | #[inline] |
| 356 | unsafe fn next_state_unchecked( |
| 357 | &self, |
| 358 | current: Self::ID, |
| 359 | input: u8, |
| 360 | ) -> Self::ID { |
| 361 | (**self).next_state_unchecked(current, input) |
| 362 | } |
| 363 | } |