Blame - src/dfa.rs - platform/external/rust/crates/regex-automata

blob: 43de3461fafe2b7a43d0b00bf984c751891a0717 [file] [log] [blame]

Jakub Kotur	3bceaeb	2020-12-21 17:28:16 +0100	[diff] [blame]	1	use state_id::StateID;
				2
				3	/// A trait describing the interface of a deterministic finite automaton (DFA).
				4	///
				5	/// Every DFA has exactly one start state and at least one dead state (which
				6	/// may be the same, as in the case of an empty DFA). In all cases, a state
				7	/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
				8	/// always returns `true`.
				9	///
				10	/// Every DFA also has zero or more match states, such that
				11	/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
				12	/// a match state.
				13	///
				14	/// In general, users of this trait likely will only need to use the search
				15	/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
				16	/// methods are lower level and are used for walking the transitions of a DFA
				17	/// manually. In particular, the aforementioned search routines are implemented
				18	/// generically in terms of the lower level transition walking routines.
				19	pub trait DFA {
				20	/// The representation used for state identifiers in this DFA.
				21	///
				22	/// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
				23	type ID: StateID;
				24
				25	/// Return the identifier of this DFA's start state.
				26	fn start_state(&self) -> Self::ID;
				27
				28	/// Returns true if and only if the given identifier corresponds to a match
				29	/// state.
				30	fn is_match_state(&self, id: Self::ID) -> bool;
				31
				32	/// Returns true if and only if the given identifier corresponds to a dead
				33	/// state. When a DFA enters a dead state, it is impossible to leave and
				34	/// thus can never lead to a match.
				35	fn is_dead_state(&self, id: Self::ID) -> bool;
				36
				37	/// Returns true if and only if the given identifier corresponds to either
				38	/// a dead state or a match state, such that one of `is_match_state(id)`
				39	/// or `is_dead_state(id)` must return true.
				40	///
				41	/// Depending on the implementation of the DFA, this routine can be used
				42	/// to save a branch in the core matching loop. Nevertheless,
				43	/// `is_match_state(id) \|\| is_dead_state(id)` is always a valid
				44	/// implementation.
				45	fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
				46
				47	/// Returns true if and only if this DFA is anchored.
				48	///
				49	/// When a DFA is anchored, it is only allowed to report matches that
				50	/// start at index `0`.
				51	fn is_anchored(&self) -> bool;
				52
				53	/// Given the current state that this DFA is in and the next input byte,
				54	/// this method returns the identifier of the next state. The identifier
				55	/// returned is always valid, but it may correspond to a dead state.
				56	fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
				57
				58	/// Like `next_state`, but its implementation may look up the next state
				59	/// without memory safety checks such as bounds checks. As such, callers
				60	/// must ensure that the given identifier corresponds to a valid DFA
				61	/// state. Implementors must, in turn, ensure that this routine is safe
				62	/// for all valid state identifiers and for all possible `u8` values.
				63	unsafe fn next_state_unchecked(
				64	&self,
				65	current: Self::ID,
				66	input: u8,
				67	) -> Self::ID;
				68
				69	/// Returns true if and only if the given bytes match this DFA.
				70	///
				71	/// This routine may short circuit if it knows that scanning future input
				72	/// will never lead to a different result. In particular, if a DFA enters
				73	/// a match state or a dead state, then this routine will return `true` or
				74	/// `false`, respectively, without inspecting any future input.
				75	///
				76	/// # Example
				77	///
				78	/// This example shows how to use this method with a
				79	/// [`DenseDFA`](enum.DenseDFA.html).
				80	///
				81	/// ```
				82	/// use regex_automata::{DFA, DenseDFA};
				83	///
				84	/// # fn example() -> Result<(), regex_automata::Error> {
				85	/// let dfa = DenseDFA::new("foo[0-9]+bar")?;
				86	/// assert_eq!(true, dfa.is_match(b"foo12345bar"));
				87	/// assert_eq!(false, dfa.is_match(b"foobar"));
				88	/// # Ok(()) }; example().unwrap()
				89	/// ```
				90	#[inline]
				91	fn is_match(&self, bytes: &[u8]) -> bool {
				92	self.is_match_at(bytes, 0)
				93	}
				94
				95	/// Returns the first position at which a match is found.
				96	///
				97	/// This routine stops scanning input in precisely the same circumstances
				98	/// as `is_match`. The key difference is that this routine returns the
				99	/// position at which it stopped scanning input if and only if a match
				100	/// was found. If no match is found, then `None` is returned.
				101	///
				102	/// # Example
				103	///
				104	/// This example shows how to use this method with a
				105	/// [`DenseDFA`](enum.DenseDFA.html).
				106	///
				107	/// ```
				108	/// use regex_automata::{DFA, DenseDFA};
				109	///
				110	/// # fn example() -> Result<(), regex_automata::Error> {
				111	/// let dfa = DenseDFA::new("foo[0-9]+")?;
				112	/// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
				113	///
				114	/// // Normally, the end of the leftmost first match here would be 3,
				115	/// // but the shortest match semantics detect a match earlier.
				116	/// let dfa = DenseDFA::new("abc\|a")?;
				117	/// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
				118	/// # Ok(()) }; example().unwrap()
				119	/// ```
				120	#[inline]
				121	fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
				122	self.shortest_match_at(bytes, 0)
				123	}
				124
				125	/// Returns the end offset of the longest match. If no match exists,
				126	/// then `None` is returned.
				127	///
				128	/// Implementors of this trait are not required to implement any particular
				129	/// match semantics (such as leftmost-first), which are instead manifest in
				130	/// the DFA's topology itself.
				131	///
				132	/// In particular, this method must continue searching even after it
				133	/// enters a match state. The search should only terminate once it has
				134	/// reached the end of the input or when it has entered a dead state. Upon
				135	/// termination, the position of the last byte seen while still in a match
				136	/// state is returned.
				137	///
				138	/// # Example
				139	///
				140	/// This example shows how to use this method with a
				141	/// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
				142	/// "leftmost first" match semantics.
				143	///
				144	/// Leftmost first match semantics corresponds to the match with the
				145	/// smallest starting offset, but where the end offset is determined by
				146	/// preferring earlier branches in the original regular expression. For
				147	/// example, `Sam\|Samwise` will match `Sam` in `Samwise`, but `Samwise\|Sam`
				148	/// will match `Samwise` in `Samwise`.
				149	///
				150	/// Generally speaking, the "leftmost first" match is how most backtracking
				151	/// regular expressions tend to work. This is in contrast to POSIX-style
				152	/// regular expressions that yield "leftmost longest" matches. Namely,
				153	/// both `Sam\|Samwise` and `Samwise\|Sam` match `Samwise` when using
				154	/// leftmost longest semantics.
				155	///
				156	/// ```
				157	/// use regex_automata::{DFA, DenseDFA};
				158	///
				159	/// # fn example() -> Result<(), regex_automata::Error> {
				160	/// let dfa = DenseDFA::new("foo[0-9]+")?;
				161	/// assert_eq!(Some(8), dfa.find(b"foo12345"));
				162	///
				163	/// // Even though a match is found after reading the first byte (`a`),
				164	/// // the leftmost first match semantics demand that we find the earliest
				165	/// // match that prefers earlier parts of the pattern over latter parts.
				166	/// let dfa = DenseDFA::new("abc\|a")?;
				167	/// assert_eq!(Some(3), dfa.find(b"abc"));
				168	/// # Ok(()) }; example().unwrap()
				169	/// ```
				170	#[inline]
				171	fn find(&self, bytes: &[u8]) -> Option<usize> {
				172	self.find_at(bytes, 0)
				173	}
				174
				175	/// Returns the start offset of the longest match in reverse, by searching
				176	/// from the end of the input towards the start of the input. If no match
				177	/// exists, then `None` is returned. In other words, this has the same
				178	/// match semantics as `find`, but in reverse.
				179	///
				180	/// # Example
				181	///
				182	/// This example shows how to use this method with a
				183	/// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
				184	/// is principally useful when used in conjunction with the
				185	/// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
				186	/// configuration knob. In general, it's unlikely to be correct to use both
				187	/// `find` and `rfind` with the same DFA since any particular DFA will only
				188	/// support searching in one direction.
				189	///
				190	/// ```
				191	/// use regex_automata::{dense, DFA};
				192	///
				193	/// # fn example() -> Result<(), regex_automata::Error> {
				194	/// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
				195	/// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
				196	/// # Ok(()) }; example().unwrap()
				197	/// ```
				198	#[inline]
				199	fn rfind(&self, bytes: &[u8]) -> Option<usize> {
				200	self.rfind_at(bytes, bytes.len())
				201	}
				202
				203	/// Returns the same as `is_match`, but starts the search at the given
				204	/// offset.
				205	///
				206	/// The significance of the starting point is that it takes the surrounding
				207	/// context into consideration. For example, if the DFA is anchored, then
				208	/// a match can only occur when `start == 0`.
				209	#[inline]
				210	fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
				211	if self.is_anchored() && start > 0 {
				212	return false;
				213	}
				214
				215	let mut state = self.start_state();
				216	if self.is_match_or_dead_state(state) {
				217	return self.is_match_state(state);
				218	}
				219	for &b in bytes[start..].iter() {
				220	state = unsafe { self.next_state_unchecked(state, b) };
				221	if self.is_match_or_dead_state(state) {
				222	return self.is_match_state(state);
				223	}
				224	}
				225	false
				226	}
				227
				228	/// Returns the same as `shortest_match`, but starts the search at the
				229	/// given offset.
				230	///
				231	/// The significance of the starting point is that it takes the surrounding
				232	/// context into consideration. For example, if the DFA is anchored, then
				233	/// a match can only occur when `start == 0`.
				234	#[inline]
				235	fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
				236	if self.is_anchored() && start > 0 {
				237	return None;
				238	}
				239
				240	let mut state = self.start_state();
				241	if self.is_match_or_dead_state(state) {
				242	return if self.is_dead_state(state) { None } else { Some(start) };
				243	}
				244	for (i, &b) in bytes[start..].iter().enumerate() {
				245	state = unsafe { self.next_state_unchecked(state, b) };
				246	if self.is_match_or_dead_state(state) {
				247	return if self.is_dead_state(state) {
				248	None
				249	} else {
				250	Some(start + i + 1)
				251	};
				252	}
				253	}
				254	None
				255	}
				256
				257	/// Returns the same as `find`, but starts the search at the given
				258	/// offset.
				259	///
				260	/// The significance of the starting point is that it takes the surrounding
				261	/// context into consideration. For example, if the DFA is anchored, then
				262	/// a match can only occur when `start == 0`.
				263	#[inline]
				264	fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
				265	if self.is_anchored() && start > 0 {
				266	return None;
				267	}
				268
				269	let mut state = self.start_state();
				270	let mut last_match = if self.is_dead_state(state) {
				271	return None;
				272	} else if self.is_match_state(state) {
				273	Some(start)
				274	} else {
				275	None
				276	};
				277	for (i, &b) in bytes[start..].iter().enumerate() {
				278	state = unsafe { self.next_state_unchecked(state, b) };
				279	if self.is_match_or_dead_state(state) {
				280	if self.is_dead_state(state) {
				281	return last_match;
				282	}
				283	last_match = Some(start + i + 1);
				284	}
				285	}
				286	last_match
				287	}
				288
				289	/// Returns the same as `rfind`, but starts the search at the given
				290	/// offset.
				291	///
				292	/// The significance of the starting point is that it takes the surrounding
				293	/// context into consideration. For example, if the DFA is anchored, then
				294	/// a match can only occur when `start == bytes.len()`.
				295	#[inline(never)]
				296	fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
				297	if self.is_anchored() && start < bytes.len() {
				298	return None;
				299	}
				300
				301	let mut state = self.start_state();
				302	let mut last_match = if self.is_dead_state(state) {
				303	return None;
				304	} else if self.is_match_state(state) {
				305	Some(start)
				306	} else {
				307	None
				308	};
				309	for (i, &b) in bytes[..start].iter().enumerate().rev() {
				310	state = unsafe { self.next_state_unchecked(state, b) };
				311	if self.is_match_or_dead_state(state) {
				312	if self.is_dead_state(state) {
				313	return last_match;
				314	}
				315	last_match = Some(i);
				316	}
				317	}
				318	last_match
				319	}
				320	}
				321
				322	impl<'a, T: DFA> DFA for &'a T {
				323	type ID = T::ID;
				324
				325	#[inline]
				326	fn start_state(&self) -> Self::ID {
				327	(**self).start_state()
				328	}
				329
				330	#[inline]
				331	fn is_match_state(&self, id: Self::ID) -> bool {
				332	(**self).is_match_state(id)
				333	}
				334
				335	#[inline]
				336	fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
				337	(**self).is_match_or_dead_state(id)
				338	}
				339
				340	#[inline]
				341	fn is_dead_state(&self, id: Self::ID) -> bool {
				342	(**self).is_dead_state(id)
				343	}
				344
				345	#[inline]
				346	fn is_anchored(&self) -> bool {
				347	(**self).is_anchored()
				348	}
				349
				350	#[inline]
				351	fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
				352	(**self).next_state(current, input)
				353	}
				354
				355	#[inline]
				356	unsafe fn next_state_unchecked(
				357	&self,
				358	current: Self::ID,
				359	input: u8,
				360	) -> Self::ID {
				361	(**self).next_state_unchecked(current, input)
				362	}
				363	}