Blame - src/regex.rs - platform/external/rust/crates/regex-automata

blob: 47e1c58190c6217f93855c6a263233e44474b6f3 [file] [log] [blame]

Jakub Kotur	3bceaeb	2020-12-21 17:28:16 +0100	[diff] [blame]	1	#[cfg(feature = "std")]
				2	use dense::{self, DenseDFA};
				3	use dfa::DFA;
				4	#[cfg(feature = "std")]
				5	use error::Result;
				6	#[cfg(feature = "std")]
				7	use sparse::SparseDFA;
				8	#[cfg(feature = "std")]
				9	use state_id::StateID;
				10
				11	/// A regular expression that uses deterministic finite automata for fast
				12	/// searching.
				13	///
				14	/// A regular expression is comprised of two DFAs, a "forward" DFA and a
				15	/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
				16	/// match while the reverse DFA is responsible for detecting the start of a
				17	/// match. Thus, in order to find the bounds of any given match, a forward
				18	/// search must first be run followed by a reverse search. A match found by
				19	/// the forward DFA guarantees that the reverse DFA will also find a match.
				20	///
				21	/// The type of the DFA used by a `Regex` corresponds to the `D` type
				22	/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
				23	/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
				24	/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
				25	/// search faster, while sparse DFAs use less memory but search more slowly.
				26	///
				27	/// By default, a regex's DFA type parameter is set to
				28	/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
				29	/// most convenient type that gives the best search performance.
				30	///
				31	/// # Sparse DFAs
				32	///
				33	/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
				34	/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
				35	/// enough to build corresponding sparse DFAs, and then build a regex from
				36	/// them:
				37	///
				38	/// ```
				39	/// use regex_automata::Regex;
				40	///
				41	/// # fn example() -> Result<(), regex_automata::Error> {
				42	/// // First, build a regex that uses dense DFAs.
				43	/// let dense_re = Regex::new("foo[0-9]+")?;
				44	///
				45	/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
				46	/// let fwd = dense_re.forward().to_sparse()?;
				47	/// let rev = dense_re.reverse().to_sparse()?;
				48	///
				49	/// // Third, build a new regex from the constituent sparse DFAs.
				50	/// let sparse_re = Regex::from_dfas(fwd, rev);
				51	///
				52	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
				53	/// assert_eq!(true, sparse_re.is_match(b"foo123"));
				54	/// # Ok(()) }; example().unwrap()
				55	/// ```
				56	#[cfg(feature = "std")]
				57	#[derive(Clone, Debug)]
				58	pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
				59	forward: D,
				60	reverse: D,
				61	}
				62
				63	/// A regular expression that uses deterministic finite automata for fast
				64	/// searching.
				65	///
				66	/// A regular expression is comprised of two DFAs, a "forward" DFA and a
				67	/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
				68	/// match while the reverse DFA is responsible for detecting the start of a
				69	/// match. Thus, in order to find the bounds of any given match, a forward
				70	/// search must first be run followed by a reverse search. A match found by
				71	/// the forward DFA guarantees that the reverse DFA will also find a match.
				72	///
				73	/// The type of the DFA used by a `Regex` corresponds to the `D` type
				74	/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
				75	/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
				76	/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
				77	/// search faster, while sparse DFAs use less memory but search more slowly.
				78	///
				79	/// When using this crate without the standard library, the `Regex` type has
				80	/// no default type parameter.
				81	///
				82	/// # Sparse DFAs
				83	///
				84	/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
				85	/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
				86	/// enough to build corresponding sparse DFAs, and then build a regex from
				87	/// them:
				88	///
				89	/// ```
				90	/// use regex_automata::Regex;
				91	///
				92	/// # fn example() -> Result<(), regex_automata::Error> {
				93	/// // First, build a regex that uses dense DFAs.
				94	/// let dense_re = Regex::new("foo[0-9]+")?;
				95	///
				96	/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
				97	/// let fwd = dense_re.forward().to_sparse()?;
				98	/// let rev = dense_re.reverse().to_sparse()?;
				99	///
				100	/// // Third, build a new regex from the constituent sparse DFAs.
				101	/// let sparse_re = Regex::from_dfas(fwd, rev);
				102	///
				103	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
				104	/// assert_eq!(true, sparse_re.is_match(b"foo123"));
				105	/// # Ok(()) }; example().unwrap()
				106	/// ```
				107	#[cfg(not(feature = "std"))]
				108	#[derive(Clone, Debug)]
				109	pub struct Regex<D> {
				110	forward: D,
				111	reverse: D,
				112	}
				113
				114	#[cfg(feature = "std")]
				115	impl Regex {
				116	/// Parse the given regular expression using a default configuration and
				117	/// return the corresponding regex.
				118	///
				119	/// The default configuration uses `usize` for state IDs, premultiplies
				120	/// them and reduces the alphabet size by splitting bytes into equivalence
				121	/// classes. The underlying DFAs are not minimized.
				122	///
				123	/// If you want a non-default configuration, then use the
				124	/// [`RegexBuilder`](struct.RegexBuilder.html)
				125	/// to set your own configuration.
				126	///
				127	/// # Example
				128	///
				129	/// ```
				130	/// use regex_automata::Regex;
				131	///
				132	/// # fn example() -> Result<(), regex_automata::Error> {
				133	/// let re = Regex::new("foo[0-9]+bar")?;
				134	/// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
				135	/// # Ok(()) }; example().unwrap()
				136	/// ```
				137	pub fn new(pattern: &str) -> Result<Regex> {
				138	RegexBuilder::new().build(pattern)
				139	}
				140	}
				141
				142	#[cfg(feature = "std")]
				143	impl Regex<SparseDFA<Vec<u8>, usize>> {
				144	/// Parse the given regular expression using a default configuration and
				145	/// return the corresponding regex using sparse DFAs.
				146	///
				147	/// The default configuration uses `usize` for state IDs, reduces the
				148	/// alphabet size by splitting bytes into equivalence classes. The
				149	/// underlying DFAs are not minimized.
				150	///
				151	/// If you want a non-default configuration, then use the
				152	/// [`RegexBuilder`](struct.RegexBuilder.html)
				153	/// to set your own configuration.
				154	///
				155	/// # Example
				156	///
				157	/// ```
				158	/// use regex_automata::Regex;
				159	///
				160	/// # fn example() -> Result<(), regex_automata::Error> {
				161	/// let re = Regex::new_sparse("foo[0-9]+bar")?;
				162	/// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
				163	/// # Ok(()) }; example().unwrap()
				164	/// ```
				165	pub fn new_sparse(
				166	pattern: &str,
				167	) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
				168	RegexBuilder::new().build_sparse(pattern)
				169	}
				170	}
				171
				172	impl<D: DFA> Regex<D> {
				173	/// Returns true if and only if the given bytes match.
				174	///
				175	/// This routine may short circuit if it knows that scanning future input
				176	/// will never lead to a different result. In particular, if the underlying
				177	/// DFA enters a match state or a dead state, then this routine will return
				178	/// `true` or `false`, respectively, without inspecting any future input.
				179	///
				180	/// # Example
				181	///
				182	/// ```
				183	/// use regex_automata::Regex;
				184	///
				185	/// # fn example() -> Result<(), regex_automata::Error> {
				186	/// let re = Regex::new("foo[0-9]+bar")?;
				187	/// assert_eq!(true, re.is_match(b"foo12345bar"));
				188	/// assert_eq!(false, re.is_match(b"foobar"));
				189	/// # Ok(()) }; example().unwrap()
				190	/// ```
				191	pub fn is_match(&self, input: &[u8]) -> bool {
				192	self.is_match_at(input, 0)
				193	}
				194
				195	/// Returns the first position at which a match is found.
				196	///
				197	/// This routine stops scanning input in precisely the same circumstances
				198	/// as `is_match`. The key difference is that this routine returns the
				199	/// position at which it stopped scanning input if and only if a match
				200	/// was found. If no match is found, then `None` is returned.
				201	///
				202	/// # Example
				203	///
				204	/// ```
				205	/// use regex_automata::Regex;
				206	///
				207	/// # fn example() -> Result<(), regex_automata::Error> {
				208	/// let re = Regex::new("foo[0-9]+")?;
				209	/// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
				210	///
				211	/// // Normally, the end of the leftmost first match here would be 3,
				212	/// // but the shortest match semantics detect a match earlier.
				213	/// let re = Regex::new("abc\|a")?;
				214	/// assert_eq!(Some(1), re.shortest_match(b"abc"));
				215	/// # Ok(()) }; example().unwrap()
				216	/// ```
				217	pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
				218	self.shortest_match_at(input, 0)
				219	}
				220
				221	/// Returns the start and end offset of the leftmost first match. If no
				222	/// match exists, then `None` is returned.
				223	///
				224	/// The "leftmost first" match corresponds to the match with the smallest
				225	/// starting offset, but where the end offset is determined by preferring
				226	/// earlier branches in the original regular expression. For example,
				227	/// `Sam\|Samwise` will match `Sam` in `Samwise`, but `Samwise\|Sam` will
				228	/// match `Samwise` in `Samwise`.
				229	///
				230	/// Generally speaking, the "leftmost first" match is how most backtracking
				231	/// regular expressions tend to work. This is in contrast to POSIX-style
				232	/// regular expressions that yield "leftmost longest" matches. Namely,
				233	/// both `Sam\|Samwise` and `Samwise\|Sam` match `Samwise` when using
				234	/// leftmost longest semantics.
				235	///
				236	/// # Example
				237	///
				238	/// ```
				239	/// use regex_automata::Regex;
				240	///
				241	/// # fn example() -> Result<(), regex_automata::Error> {
				242	/// let re = Regex::new("foo[0-9]+")?;
				243	/// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
				244	///
				245	/// // Even though a match is found after reading the first byte (`a`),
				246	/// // the leftmost first match semantics demand that we find the earliest
				247	/// // match that prefers earlier parts of the pattern over latter parts.
				248	/// let re = Regex::new("abc\|a")?;
				249	/// assert_eq!(Some((0, 3)), re.find(b"abc"));
				250	/// # Ok(()) }; example().unwrap()
				251	/// ```
				252	pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
				253	self.find_at(input, 0)
				254	}
				255
				256	/// Returns the same as `is_match`, but starts the search at the given
				257	/// offset.
				258	///
				259	/// The significance of the starting point is that it takes the surrounding
				260	/// context into consideration. For example, if the DFA is anchored, then
				261	/// a match can only occur when `start == 0`.
				262	pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
				263	self.forward().is_match_at(input, start)
				264	}
				265
				266	/// Returns the same as `shortest_match`, but starts the search at the
				267	/// given offset.
				268	///
				269	/// The significance of the starting point is that it takes the surrounding
				270	/// context into consideration. For example, if the DFA is anchored, then
				271	/// a match can only occur when `start == 0`.
				272	pub fn shortest_match_at(
				273	&self,
				274	input: &[u8],
				275	start: usize,
				276	) -> Option<usize> {
				277	self.forward().shortest_match_at(input, start)
				278	}
				279
				280	/// Returns the same as `find`, but starts the search at the given
				281	/// offset.
				282	///
				283	/// The significance of the starting point is that it takes the surrounding
				284	/// context into consideration. For example, if the DFA is anchored, then
				285	/// a match can only occur when `start == 0`.
				286	pub fn find_at(
				287	&self,
				288	input: &[u8],
				289	start: usize,
				290	) -> Option<(usize, usize)> {
				291	let end = match self.forward().find_at(input, start) {
				292	None => return None,
				293	Some(end) => end,
				294	};
				295	let start = self
				296	.reverse()
				297	.rfind(&input[start..end])
				298	.map(\|i\| start + i)
				299	.expect("reverse search must match if forward search does");
				300	Some((start, end))
				301	}
				302
				303	/// Returns an iterator over all non-overlapping leftmost first matches
				304	/// in the given bytes. If no match exists, then the iterator yields no
				305	/// elements.
				306	///
				307	/// Note that if the regex can match the empty string, then it is
				308	/// possible for the iterator to yield a zero-width match at a location
				309	/// that is not a valid UTF-8 boundary (for example, between the code units
				310	/// of a UTF-8 encoded codepoint). This can happen regardless of whether
				311	/// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
				312	/// was enabled or not.
				313	///
				314	/// # Example
				315	///
				316	/// ```
				317	/// use regex_automata::Regex;
				318	///
				319	/// # fn example() -> Result<(), regex_automata::Error> {
				320	/// let re = Regex::new("foo[0-9]+")?;
				321	/// let text = b"foo1 foo12 foo123";
				322	/// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
				323	/// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
				324	/// # Ok(()) }; example().unwrap()
				325	/// ```
				326	pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
				327	Matches::new(self, input)
				328	}
				329
				330	/// Build a new regex from its constituent forward and reverse DFAs.
				331	///
				332	/// This is useful when deserializing a regex from some arbitrary
				333	/// memory region. This is also useful for building regexes from other
				334	/// types of DFAs.
				335	///
				336	/// # Example
				337	///
				338	/// This example is a bit a contrived. The usual use of these methods
				339	/// would involve serializing `initial_re` somewhere and then deserializing
				340	/// it later to build a regex.
				341	///
				342	/// ```
				343	/// use regex_automata::Regex;
				344	///
				345	/// # fn example() -> Result<(), regex_automata::Error> {
				346	/// let initial_re = Regex::new("foo[0-9]+")?;
				347	/// assert_eq!(true, initial_re.is_match(b"foo123"));
				348	///
				349	/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
				350	/// let re = Regex::from_dfas(fwd, rev);
				351	/// assert_eq!(true, re.is_match(b"foo123"));
				352	/// # Ok(()) }; example().unwrap()
				353	/// ```
				354	///
				355	/// This example shows how you might build smaller DFAs, and then use those
				356	/// smaller DFAs to build a new regex.
				357	///
				358	/// ```
				359	/// use regex_automata::Regex;
				360	///
				361	/// # fn example() -> Result<(), regex_automata::Error> {
				362	/// let initial_re = Regex::new("foo[0-9]+")?;
				363	/// assert_eq!(true, initial_re.is_match(b"foo123"));
				364	///
				365	/// let fwd = initial_re.forward().to_u16()?;
				366	/// let rev = initial_re.reverse().to_u16()?;
				367	/// let re = Regex::from_dfas(fwd, rev);
				368	/// assert_eq!(true, re.is_match(b"foo123"));
				369	/// # Ok(()) }; example().unwrap()
				370	/// ```
				371	///
				372	/// This example shows how to build a `Regex` that uses sparse DFAs instead
				373	/// of dense DFAs:
				374	///
				375	/// ```
				376	/// use regex_automata::Regex;
				377	///
				378	/// # fn example() -> Result<(), regex_automata::Error> {
				379	/// let initial_re = Regex::new("foo[0-9]+")?;
				380	/// assert_eq!(true, initial_re.is_match(b"foo123"));
				381	///
				382	/// let fwd = initial_re.forward().to_sparse()?;
				383	/// let rev = initial_re.reverse().to_sparse()?;
				384	/// let re = Regex::from_dfas(fwd, rev);
				385	/// assert_eq!(true, re.is_match(b"foo123"));
				386	/// # Ok(()) }; example().unwrap()
				387	/// ```
				388	pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
				389	Regex { forward, reverse }
				390	}
				391
				392	/// Return the underlying DFA responsible for forward matching.
				393	pub fn forward(&self) -> &D {
				394	&self.forward
				395	}
				396
				397	/// Return the underlying DFA responsible for reverse matching.
				398	pub fn reverse(&self) -> &D {
				399	&self.reverse
				400	}
				401	}
				402
				403	/// An iterator over all non-overlapping matches for a particular search.
				404	///
				405	/// The iterator yields a `(usize, usize)` value until no more matches could be
				406	/// found. The first `usize` is the start of the match (inclusive) while the
				407	/// second `usize` is the end of the match (exclusive).
				408	///
				409	/// `S` is the type used to represent state identifiers in the underlying
				410	/// regex. The lifetime variables are as follows:
				411	///
				412	/// * `'r` is the lifetime of the regular expression value itself.
				413	/// * `'t` is the lifetime of the text being searched.
				414	#[derive(Clone, Debug)]
				415	pub struct Matches<'r, 't, D: DFA + 'r> {
				416	re: &'r Regex<D>,
				417	text: &'t [u8],
				418	last_end: usize,
				419	last_match: Option<usize>,
				420	}
				421
				422	impl<'r, 't, D: DFA> Matches<'r, 't, D> {
				423	fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
				424	Matches { re, text, last_end: 0, last_match: None }
				425	}
				426	}
				427
				428	impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
				429	type Item = (usize, usize);
				430
				431	fn next(&mut self) -> Option<(usize, usize)> {
				432	if self.last_end > self.text.len() {
				433	return None;
				434	}
				435	let (s, e) = match self.re.find_at(self.text, self.last_end) {
				436	None => return None,
				437	Some((s, e)) => (s, e),
				438	};
				439	if s == e {
				440	// This is an empty match. To ensure we make progress, start
				441	// the next search at the smallest possible starting position
				442	// of the next match following this one.
				443	self.last_end = e + 1;
				444	// Don't accept empty matches immediately following a match.
				445	// Just move on to the next match.
				446	if Some(e) == self.last_match {
				447	return self.next();
				448	}
				449	} else {
				450	self.last_end = e;
				451	}
				452	self.last_match = Some(e);
				453	Some((s, e))
				454	}
				455	}
				456
				457	/// A builder for a regex based on deterministic finite automatons.
				458	///
				459	/// This builder permits configuring several aspects of the construction
				460	/// process such as case insensitivity, Unicode support and various options
				461	/// that impact the size of the underlying DFAs. In some cases, options (like
				462	/// performing DFA minimization) can come with a substantial additional cost.
				463	///
				464	/// This builder generally constructs two DFAs, where one is responsible for
				465	/// finding the end of a match and the other is responsible for finding the
				466	/// start of a match. If you only need to detect whether something matched,
				467	/// or only the end of a match, then you should use a
				468	/// [`dense::Builder`](dense/struct.Builder.html)
				469	/// to construct a single DFA, which is cheaper than building two DFAs.
				470	#[cfg(feature = "std")]
				471	#[derive(Clone, Debug)]
				472	pub struct RegexBuilder {
				473	dfa: dense::Builder,
				474	}
				475
				476	#[cfg(feature = "std")]
				477	impl RegexBuilder {
				478	/// Create a new regex builder with the default configuration.
				479	pub fn new() -> RegexBuilder {
				480	RegexBuilder { dfa: dense::Builder::new() }
				481	}
				482
				483	/// Build a regex from the given pattern.
				484	///
				485	/// If there was a problem parsing or compiling the pattern, then an error
				486	/// is returned.
				487	pub fn build(&self, pattern: &str) -> Result<Regex> {
				488	self.build_with_size::<usize>(pattern)
				489	}
				490
				491	/// Build a regex from the given pattern using sparse DFAs.
				492	///
				493	/// If there was a problem parsing or compiling the pattern, then an error
				494	/// is returned.
				495	pub fn build_sparse(
				496	&self,
				497	pattern: &str,
				498	) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
				499	self.build_with_size_sparse::<usize>(pattern)
				500	}
				501
				502	/// Build a regex from the given pattern using a specific representation
				503	/// for the underlying DFA state IDs.
				504	///
				505	/// If there was a problem parsing or compiling the pattern, then an error
				506	/// is returned.
				507	///
				508	/// The representation of state IDs is determined by the `S` type
				509	/// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
				510	/// or `usize`, where `usize` is the default used for `build`. The purpose
				511	/// of specifying a representation for state IDs is to reduce the memory
				512	/// footprint of the underlying DFAs.
				513	///
				514	/// When using this routine, the chosen state ID representation will be
				515	/// used throughout determinization and minimization, if minimization was
				516	/// requested. Even if the minimized DFAs can fit into the chosen state ID
				517	/// representation but the initial determinized DFA cannot, then this will
				518	/// still return an error. To get a minimized DFA with a smaller state ID
				519	/// representation, first build it with a bigger state ID representation,
				520	/// and then shrink the sizes of the DFAs using one of its conversion
				521	/// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
				522	/// Finally, reconstitute the regex via
				523	/// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa).
				524	pub fn build_with_size<S: StateID>(
				525	&self,
				526	pattern: &str,
				527	) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
				528	let forward = self.dfa.build_with_size(pattern)?;
				529	let reverse = self
				530	.dfa
				531	.clone()
				532	.anchored(true)
				533	.reverse(true)
				534	.longest_match(true)
				535	.build_with_size(pattern)?;
				536	Ok(Regex::from_dfas(forward, reverse))
				537	}
				538
				539	/// Build a regex from the given pattern using a specific representation
				540	/// for the underlying DFA state IDs using sparse DFAs.
				541	pub fn build_with_size_sparse<S: StateID>(
				542	&self,
				543	pattern: &str,
				544	) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
				545	let re = self.build_with_size(pattern)?;
				546	let fwd = re.forward().to_sparse()?;
				547	let rev = re.reverse().to_sparse()?;
				548	Ok(Regex::from_dfas(fwd, rev))
				549	}
				550
				551	/// Set whether matching must be anchored at the beginning of the input.
				552	///
				553	/// When enabled, a match must begin at the start of the input. When
				554	/// disabled, the regex will act as if the pattern started with a `.*?`,
				555	/// which enables a match to appear anywhere.
				556	///
				557	/// By default this is disabled.
				558	pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
				559	self.dfa.anchored(yes);
				560	self
				561	}
				562
				563	/// Enable or disable the case insensitive flag by default.
				564	///
				565	/// By default this is disabled. It may alternatively be selectively
				566	/// enabled in the regular expression itself via the `i` flag.
				567	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
				568	self.dfa.case_insensitive(yes);
				569	self
				570	}
				571
				572	/// Enable verbose mode in the regular expression.
				573	///
				574	/// When enabled, verbose mode permits insigificant whitespace in many
				575	/// places in the regular expression, as well as comments. Comments are
				576	/// started using `#` and continue until the end of the line.
				577	///
				578	/// By default, this is disabled. It may be selectively enabled in the
				579	/// regular expression by using the `x` flag regardless of this setting.
				580	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
				581	self.dfa.ignore_whitespace(yes);
				582	self
				583	}
				584
				585	/// Enable or disable the "dot matches any character" flag by default.
				586	///
				587	/// By default this is disabled. It may alternatively be selectively
				588	/// enabled in the regular expression itself via the `s` flag.
				589	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
				590	self.dfa.dot_matches_new_line(yes);
				591	self
				592	}
				593
				594	/// Enable or disable the "swap greed" flag by default.
				595	///
				596	/// By default this is disabled. It may alternatively be selectively
				597	/// enabled in the regular expression itself via the `U` flag.
				598	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
				599	self.dfa.swap_greed(yes);
				600	self
				601	}
				602
				603	/// Enable or disable the Unicode flag (`u`) by default.
				604	///
				605	/// By default this is enabled. It may alternatively be selectively
				606	/// disabled in the regular expression itself via the `u` flag.
				607	///
				608	/// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
				609	/// default), a regular expression will fail to parse if Unicode mode is
				610	/// disabled and a sub-expression could possibly match invalid UTF-8.
				611	pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
				612	self.dfa.unicode(yes);
				613	self
				614	}
				615
				616	/// When enabled, the builder will permit the construction of a regular
				617	/// expression that may match invalid UTF-8.
				618	///
				619	/// When disabled (the default), the builder is guaranteed to produce a
				620	/// regex that will only ever match valid UTF-8 (otherwise, the builder
				621	/// will return an error).
				622	pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
				623	self.dfa.allow_invalid_utf8(yes);
				624	self
				625	}
				626
				627	/// Set the nesting limit used for the regular expression parser.
				628	///
				629	/// The nesting limit controls how deep the abstract syntax tree is allowed
				630	/// to be. If the AST exceeds the given limit (e.g., with too many nested
				631	/// groups), then an error is returned by the parser.
				632	///
				633	/// The purpose of this limit is to act as a heuristic to prevent stack
				634	/// overflow when building a finite automaton from a regular expression's
				635	/// abstract syntax tree. In particular, construction currently uses
				636	/// recursion. In the future, the implementation may stop using recursion
				637	/// and this option will no longer be necessary.
				638	///
				639	/// This limit is not checked until the entire AST is parsed. Therefore,
				640	/// if callers want to put a limit on the amount of heap space used, then
				641	/// they should impose a limit on the length, in bytes, of the concrete
				642	/// pattern string. In particular, this is viable since the parser will
				643	/// limit itself to heap space proportional to the lenth of the pattern
				644	/// string.
				645	///
				646	/// Note that a nest limit of `0` will return a nest limit error for most
				647	/// patterns but not all. For example, a nest limit of `0` permits `a` but
				648	/// not `ab`, since `ab` requires a concatenation AST item, which results
				649	/// in a nest depth of `1`. In general, a nest limit is not something that
				650	/// manifests in an obvious way in the concrete syntax, therefore, it
				651	/// should not be used in a granular way.
				652	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
				653	self.dfa.nest_limit(limit);
				654	self
				655	}
				656
				657	/// Minimize the underlying DFAs.
				658	///
				659	/// When enabled, the DFAs powering the resulting regex will be minimized
				660	/// such that it is as small as possible.
				661	///
				662	/// Whether one enables minimization or not depends on the types of costs
				663	/// you're willing to pay and how much you care about its benefits. In
				664	/// particular, minimization has worst case `O(nklogn)` time and `O(k*n)`
				665	/// space, where `n` is the number of DFA states and `k` is the alphabet
				666	/// size. In practice, minimization can be quite costly in terms of both
				667	/// space and time, so it should only be done if you're willing to wait
				668	/// longer to produce a DFA. In general, you might want a minimal DFA in
				669	/// the following circumstances:
				670	///
				671	/// 1. You would like to optimize for the size of the automaton. This can
				672	/// manifest in one of two ways. Firstly, if you're converting the
				673	/// DFA into Rust code (or a table embedded in the code), then a minimal
				674	/// DFA will translate into a corresponding reduction in code size, and
				675	/// thus, also the final compiled binary size. Secondly, if you are
				676	/// building many DFAs and putting them on the heap, you'll be able to
				677	/// fit more if they are smaller. Note though that building a minimal
				678	/// DFA itself requires additional space; you only realize the space
				679	/// savings once the minimal DFA is constructed (at which point, the
				680	/// space used for minimization is freed).
				681	/// 2. You've observed that a smaller DFA results in faster match
				682	/// performance. Naively, this isn't guaranteed since there is no
				683	/// inherent difference between matching with a bigger-than-minimal
				684	/// DFA and a minimal DFA. However, a smaller DFA may make use of your
				685	/// CPU's cache more efficiently.
				686	/// 3. You are trying to establish an equivalence between regular
				687	/// languages. The standard method for this is to build a minimal DFA
				688	/// for each language and then compare them. If the DFAs are equivalent
				689	/// (up to state renaming), then the languages are equivalent.
				690	///
				691	/// This option is disabled by default.
				692	pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
				693	self.dfa.minimize(yes);
				694	self
				695	}
				696
				697	/// Premultiply state identifiers in the underlying DFA transition tables.
				698	///
				699	/// When enabled, state identifiers are premultiplied to point to their
				700	/// corresponding row in the DFA's transition table. That is, given the
				701	/// `i`th state, its corresponding premultiplied identifier is `i * k`
				702	/// where `k` is the alphabet size of the DFA. (The alphabet size is at
				703	/// most 256, but is in practice smaller if byte classes is enabled.)
				704	///
				705	/// When state identifiers are not premultiplied, then the identifier of
				706	/// the `i`th state is `i`.
				707	///
				708	/// The advantage of premultiplying state identifiers is that is saves
				709	/// a multiplication instruction per byte when searching with the DFA.
				710	/// This has been observed to lead to a 20% performance benefit in
				711	/// micro-benchmarks.
				712	///
				713	/// The primary disadvantage of premultiplying state identifiers is
				714	/// that they require a larger integer size to represent. For example,
				715	/// if your DFA has 200 states, then its premultiplied form requires
				716	/// 16 bits to represent every possible state identifier, where as its
				717	/// non-premultiplied form only requires 8 bits.
				718	///
				719	/// This option is enabled by default.
				720	pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
				721	self.dfa.premultiply(yes);
				722	self
				723	}
				724
				725	/// Shrink the size of the underlying DFA alphabet by mapping bytes to
				726	/// their equivalence classes.
				727	///
				728	/// When enabled, each DFA will use a map from all possible bytes to their
				729	/// corresponding equivalence class. Each equivalence class represents a
				730	/// set of bytes that does not discriminate between a match and a non-match
				731	/// in the DFA. For example, the pattern `[ab]+` has at least two
				732	/// equivalence classes: a set containing `a` and `b` and a set containing
				733	/// every byte except for `a` and `b`. `a` and `b` are in the same
				734	/// equivalence classes because they never discriminate between a match
				735	/// and a non-match.
				736	///
				737	/// The advantage of this map is that the size of the transition table can
				738	/// be reduced drastically from `#states * 256 * sizeof(id)` to
				739	/// `#states * k * sizeof(id)` where `k` is the number of equivalence
				740	/// classes. As a result, total space usage can decrease substantially.
				741	/// Moreover, since a smaller alphabet is used, compilation becomes faster
				742	/// as well.
				743	///
				744	/// The disadvantage of this map is that every byte searched must be
				745	/// passed through this map before it can be used to determine the next
				746	/// transition. This has a small match time performance cost.
				747	///
				748	/// This option is enabled by default.
				749	pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
				750	self.dfa.byte_classes(yes);
				751	self
				752	}
				753
				754	/// Apply best effort heuristics to shrink the NFA at the expense of more
				755	/// time/memory.
				756	///
				757	/// This may be exposed in the future, but for now is exported for use in
				758	/// the `regex-automata-debug` tool.
				759	#[doc(hidden)]
				760	pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
				761	self.dfa.shrink(yes);
				762	self
				763	}
				764	}
				765
				766	#[cfg(feature = "std")]
				767	impl Default for RegexBuilder {
				768	fn default() -> RegexBuilder {
				769	RegexBuilder::new()
				770	}
				771	}