| #[cfg(feature = "std")] |
| use dense::{self, DenseDFA}; |
| use dfa::DFA; |
| #[cfg(feature = "std")] |
| use error::Result; |
| #[cfg(feature = "std")] |
| use sparse::SparseDFA; |
| #[cfg(feature = "std")] |
| use state_id::StateID; |
| |
| /// A regular expression that uses deterministic finite automata for fast |
| /// searching. |
| /// |
| /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
| /// "reverse" DFA. The forward DFA is responsible for detecting the end of a |
| /// match while the reverse DFA is responsible for detecting the start of a |
| /// match. Thus, in order to find the bounds of any given match, a forward |
| /// search must first be run followed by a reverse search. A match found by |
| /// the forward DFA guarantees that the reverse DFA will also find a match. |
| /// |
| /// The type of the DFA used by a `Regex` corresponds to the `D` type |
| /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, |
| /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a |
| /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but |
| /// search faster, while sparse DFAs use less memory but search more slowly. |
| /// |
| /// By default, a regex's DFA type parameter is set to |
| /// `DenseDFA<Vec<usize>, usize>`. For most in-memory workloads, this is the |
| /// most convenient type that gives the best search performance. |
| /// |
| /// # Sparse DFAs |
| /// |
| /// Since a `Regex` is generic over the `DFA` trait, it can be used with any |
| /// kind of DFA. While this crate constructs dense DFAs by default, it is easy |
| /// enough to build corresponding sparse DFAs, and then build a regex from |
| /// them: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// // First, build a regex that uses dense DFAs. |
| /// let dense_re = Regex::new("foo[0-9]+")?; |
| /// |
| /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
| /// let fwd = dense_re.forward().to_sparse()?; |
| /// let rev = dense_re.reverse().to_sparse()?; |
| /// |
| /// // Third, build a new regex from the constituent sparse DFAs. |
| /// let sparse_re = Regex::from_dfas(fwd, rev); |
| /// |
| /// // A regex that uses sparse DFAs can be used just like a regex that |
| /// // uses dense DFAs. |
| /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| #[cfg(feature = "std")] |
| #[derive(Clone, Debug)] |
| pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> { |
| forward: D, |
| reverse: D, |
| } |
| |
| /// A regular expression that uses deterministic finite automata for fast |
| /// searching. |
| /// |
| /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
| /// "reverse" DFA. The forward DFA is responsible for detecting the end of a |
| /// match while the reverse DFA is responsible for detecting the start of a |
| /// match. Thus, in order to find the bounds of any given match, a forward |
| /// search must first be run followed by a reverse search. A match found by |
| /// the forward DFA guarantees that the reverse DFA will also find a match. |
| /// |
| /// The type of the DFA used by a `Regex` corresponds to the `D` type |
| /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, |
| /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a |
| /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but |
| /// search faster, while sparse DFAs use less memory but search more slowly. |
| /// |
| /// When using this crate without the standard library, the `Regex` type has |
| /// no default type parameter. |
| /// |
| /// # Sparse DFAs |
| /// |
| /// Since a `Regex` is generic over the `DFA` trait, it can be used with any |
| /// kind of DFA. While this crate constructs dense DFAs by default, it is easy |
| /// enough to build corresponding sparse DFAs, and then build a regex from |
| /// them: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// // First, build a regex that uses dense DFAs. |
| /// let dense_re = Regex::new("foo[0-9]+")?; |
| /// |
| /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
| /// let fwd = dense_re.forward().to_sparse()?; |
| /// let rev = dense_re.reverse().to_sparse()?; |
| /// |
| /// // Third, build a new regex from the constituent sparse DFAs. |
| /// let sparse_re = Regex::from_dfas(fwd, rev); |
| /// |
| /// // A regex that uses sparse DFAs can be used just like a regex that |
| /// // uses dense DFAs. |
| /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| #[cfg(not(feature = "std"))] |
| #[derive(Clone, Debug)] |
| pub struct Regex<D> { |
| forward: D, |
| reverse: D, |
| } |
| |
| #[cfg(feature = "std")] |
| impl Regex { |
| /// Parse the given regular expression using a default configuration and |
| /// return the corresponding regex. |
| /// |
| /// The default configuration uses `usize` for state IDs, premultiplies |
| /// them and reduces the alphabet size by splitting bytes into equivalence |
| /// classes. The underlying DFAs are *not* minimized. |
| /// |
| /// If you want a non-default configuration, then use the |
| /// [`RegexBuilder`](struct.RegexBuilder.html) |
| /// to set your own configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+bar")?; |
| /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn new(pattern: &str) -> Result<Regex> { |
| RegexBuilder::new().build(pattern) |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| impl Regex<SparseDFA<Vec<u8>, usize>> { |
| /// Parse the given regular expression using a default configuration and |
| /// return the corresponding regex using sparse DFAs. |
| /// |
| /// The default configuration uses `usize` for state IDs and reduces the |
| /// alphabet size by splitting bytes into equivalence classes. The |
| /// underlying DFAs are *not* minimized. |
| /// |
| /// If you want a non-default configuration, then use the |
| /// [`RegexBuilder`](struct.RegexBuilder.html) |
| /// to set your own configuration. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new_sparse("foo[0-9]+bar")?; |
| /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn new_sparse( |
| pattern: &str, |
| ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { |
| RegexBuilder::new().build_sparse(pattern) |
| } |
| } |
| |
| impl<D: DFA> Regex<D> { |
| /// Returns true if and only if the given bytes match. |
| /// |
| /// This routine may short-circuit if it knows that scanning future input |
| /// will never lead to a different result. In particular, if the underlying |
| /// DFA enters a match state or a dead state, then this routine will return |
| /// `true` or `false`, respectively, without inspecting any future input. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+bar")?; |
| /// assert_eq!(true, re.is_match(b"foo12345bar")); |
| /// assert_eq!(false, re.is_match(b"foobar")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn is_match(&self, input: &[u8]) -> bool { |
| self.is_match_at(input, 0) |
| } |
| |
| /// Returns the first position at which a match is found. |
| /// |
| /// This routine stops scanning input in precisely the same circumstances |
| /// as `is_match`. The key difference is that this routine returns the |
| /// position at which it stopped scanning input if and only if a match |
| /// was found. If no match is found, then `None` is returned. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(Some(4), re.shortest_match(b"foo12345")); |
| /// |
| /// // Normally, the end of the leftmost first match here would be 3, |
| /// // but the shortest match semantics detect a match earlier. |
| /// let re = Regex::new("abc|a")?; |
| /// assert_eq!(Some(1), re.shortest_match(b"abc")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn shortest_match(&self, input: &[u8]) -> Option<usize> { |
| self.shortest_match_at(input, 0) |
| } |
| |
| /// Returns the start and end offset of the leftmost first match. If no |
| /// match exists, then `None` is returned. |
| /// |
| /// The "leftmost first" match corresponds to the match with the smallest |
| /// starting offset, but where the end offset is determined by preferring |
| /// earlier branches in the original regular expression. For example, |
| /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will |
| /// match `Samwise` in `Samwise`. |
| /// |
| /// Generally speaking, the "leftmost first" match is how most backtracking |
| /// regular expressions tend to work. This is in contrast to POSIX-style |
| /// regular expressions that yield "leftmost longest" matches. Namely, |
| /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
| /// leftmost longest semantics. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz")); |
| /// |
| /// // Even though a match is found after reading the first byte (`a`), |
| /// // the leftmost first match semantics demand that we find the earliest |
| /// // match that prefers earlier parts of the pattern over later parts. |
| /// let re = Regex::new("abc|a")?; |
| /// assert_eq!(Some((0, 3)), re.find(b"abc")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> { |
| self.find_at(input, 0) |
| } |
| |
| /// Returns the same as `is_match`, but starts the search at the given |
| /// offset. |
| /// |
| /// The significance of the starting point is that it takes the surrounding |
| /// context into consideration. For example, if the DFA is anchored, then |
| /// a match can only occur when `start == 0`. |
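| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern and haystack are arbitrary) showing |
| /// that the search simply begins at the given byte offset: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// // Starting at offset 0 scans the whole input and finds a match... |
| /// assert_eq!(true, re.is_match_at(b"foo123", 0)); |
| /// // ...while starting past the only occurrence of `foo` finds nothing. |
| /// assert_eq!(false, re.is_match_at(b"foo123", 1)); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |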
| pub fn is_match_at(&self, input: &[u8], start: usize) -> bool { |
| self.forward().is_match_at(input, start) |
| } |
| |
| /// Returns the same as `shortest_match`, but starts the search at the |
| /// given offset. |
| /// |
| /// The significance of the starting point is that it takes the surrounding |
| /// context into consideration. For example, if the DFA is anchored, then |
| /// a match can only occur when `start == 0`. |
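| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern, haystack and offset are arbitrary). |
| /// Note that the returned position is an offset into the original input, |
| /// not into `&input[start..]`: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// // Starting at offset 7 skips the first `foo123` and reports the |
| /// // shortest match of the second occurrence, which ends just after |
| /// // its first digit. |
| /// assert_eq!(Some(11), re.shortest_match_at(b"foo123 foo456", 7)); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |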
| pub fn shortest_match_at( |
| &self, |
| input: &[u8], |
| start: usize, |
| ) -> Option<usize> { |
| self.forward().shortest_match_at(input, start) |
| } |
| |
| /// Returns the same as `find`, but starts the search at the given |
| /// offset. |
| /// |
| /// The significance of the starting point is that it takes the surrounding |
| /// context into consideration. For example, if the DFA is anchored, then |
| /// a match can only occur when `start == 0`. |
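| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern and haystack are arbitrary). The |
| /// offsets returned are relative to the start of `input`, not to `start`: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// // Starting at offset 1 skips the match at offset 0 and finds the next |
| /// // one, reported with absolute offsets. |
| /// assert_eq!(Some((7, 13)), re.find_at(b"foo123 foo456", 1)); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |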
| pub fn find_at( |
| &self, |
| input: &[u8], |
| start: usize, |
| ) -> Option<(usize, usize)> { |
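| // The forward DFA finds where the match ends; the reverse DFA is then |
| // run over `input[start..end]` to recover where the match starts. |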
| let end = match self.forward().find_at(input, start) { |
| None => return None, |
| Some(end) => end, |
| }; |
| let start = self |
| .reverse() |
| .rfind(&input[start..end]) |
| .map(|i| start + i) |
| .expect("reverse search must match if forward search does"); |
| Some((start, end)) |
| } |
| |
| /// Returns an iterator over all non-overlapping leftmost first matches |
| /// in the given bytes. If no match exists, then the iterator yields no |
| /// elements. |
| /// |
| /// Note that if the regex can match the empty string, then it is |
| /// possible for the iterator to yield a zero-width match at a location |
| /// that is not a valid UTF-8 boundary (for example, between the code units |
| /// of a UTF-8 encoded codepoint). This can happen regardless of whether |
| /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) |
| /// was enabled or not. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let re = Regex::new("foo[0-9]+")?; |
| /// let text = b"foo1 foo12 foo123"; |
| /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); |
| /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> { |
| Matches::new(self, input) |
| } |
| |
| /// Build a new regex from its constituent forward and reverse DFAs. |
| /// |
| /// This is useful when deserializing a regex from some arbitrary |
| /// memory region. This is also useful for building regexes from other |
| /// types of DFAs. |
| /// |
| /// # Example |
| /// |
| /// This example is a bit contrived. The usual use of these methods |
| /// would involve serializing `initial_re` somewhere and then deserializing |
| /// it later to build a regex. |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let initial_re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(true, initial_re.is_match(b"foo123")); |
| /// |
| /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
| /// let re = Regex::from_dfas(fwd, rev); |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| /// |
| /// This example shows how you might build smaller DFAs, and then use those |
| /// smaller DFAs to build a new regex. |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let initial_re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(true, initial_re.is_match(b"foo123")); |
| /// |
| /// let fwd = initial_re.forward().to_u16()?; |
| /// let rev = initial_re.reverse().to_u16()?; |
| /// let re = Regex::from_dfas(fwd, rev); |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| /// |
| /// This example shows how to build a `Regex` that uses sparse DFAs instead |
| /// of dense DFAs: |
| /// |
| /// ``` |
| /// use regex_automata::Regex; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let initial_re = Regex::new("foo[0-9]+")?; |
| /// assert_eq!(true, initial_re.is_match(b"foo123")); |
| /// |
| /// let fwd = initial_re.forward().to_sparse()?; |
| /// let rev = initial_re.reverse().to_sparse()?; |
| /// let re = Regex::from_dfas(fwd, rev); |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |
| pub fn from_dfas(forward: D, reverse: D) -> Regex<D> { |
| Regex { forward, reverse } |
| } |
| |
| /// Return the underlying DFA responsible for forward matching. |
| pub fn forward(&self) -> &D { |
| &self.forward |
| } |
| |
| /// Return the underlying DFA responsible for reverse matching. |
| pub fn reverse(&self) -> &D { |
| &self.reverse |
| } |
| } |
| |
| /// An iterator over all non-overlapping matches for a particular search. |
| /// |
| /// The iterator yields a `(usize, usize)` value until no more matches can be |
| /// found. The first `usize` is the start of the match (inclusive) while the |
| /// second `usize` is the end of the match (exclusive). |
| /// |
| /// `D` is the type of the underlying DFA used by the regex. The lifetime |
| /// parameters are as follows: |
| /// |
| /// * `'r` is the lifetime of the regular expression value itself. |
| /// * `'t` is the lifetime of the text being searched. |
| #[derive(Clone, Debug)] |
| pub struct Matches<'r, 't, D: DFA + 'r> { |
| re: &'r Regex<D>, |
| text: &'t [u8], |
| last_end: usize, |
| last_match: Option<usize>, |
| } |
| |
| impl<'r, 't, D: DFA> Matches<'r, 't, D> { |
| fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> { |
| Matches { re, text, last_end: 0, last_match: None } |
| } |
| } |
| |
| impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> { |
| type Item = (usize, usize); |
| |
| fn next(&mut self) -> Option<(usize, usize)> { |
| if self.last_end > self.text.len() { |
| return None; |
| } |
| let (s, e) = match self.re.find_at(self.text, self.last_end) { |
| None => return None, |
| Some((s, e)) => (s, e), |
| }; |
| if s == e { |
| // This is an empty match. To ensure we make progress, start |
| // the next search at the smallest possible starting position |
| // of the next match following this one. |
| self.last_end = e + 1; |
| // Don't accept empty matches immediately following a match. |
| // Just move on to the next match. |
| if Some(e) == self.last_match { |
| return self.next(); |
| } |
| } else { |
| self.last_end = e; |
| } |
| self.last_match = Some(e); |
| Some((s, e)) |
| } |
| } |
| |
| /// A builder for a regex based on deterministic finite automata. |
| /// |
| /// This builder permits configuring several aspects of the construction |
| /// process such as case insensitivity, Unicode support and various options |
| /// that impact the size of the underlying DFAs. In some cases, options (like |
| /// performing DFA minimization) can come with a substantial additional cost. |
| /// |
| /// This builder generally constructs two DFAs, where one is responsible for |
| /// finding the end of a match and the other is responsible for finding the |
| /// start of a match. If you only need to detect whether something matched, |
| /// or only the end of a match, then you should use a |
| /// [`dense::Builder`](dense/struct.Builder.html) |
| /// to construct a single DFA, which is cheaper than building two DFAs. |
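| /// |
| /// # Example |
| /// |
| /// An illustrative sketch of configuring and using a builder (the option |
| /// and pattern shown are arbitrary): |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.case_insensitive(true); |
| /// let re = builder.build("foo[0-9]+bar")?; |
| /// assert_eq!(Some((0, 9)), re.find(b"FOO123bar")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |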
| #[cfg(feature = "std")] |
| #[derive(Clone, Debug)] |
| pub struct RegexBuilder { |
| dfa: dense::Builder, |
| } |
| |
| #[cfg(feature = "std")] |
| impl RegexBuilder { |
| /// Create a new regex builder with the default configuration. |
| pub fn new() -> RegexBuilder { |
| RegexBuilder { dfa: dense::Builder::new() } |
| } |
| |
| /// Build a regex from the given pattern. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| pub fn build(&self, pattern: &str) -> Result<Regex> { |
| self.build_with_size::<usize>(pattern) |
| } |
| |
| /// Build a regex from the given pattern using sparse DFAs. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| pub fn build_sparse( |
| &self, |
| pattern: &str, |
| ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { |
| self.build_with_size_sparse::<usize>(pattern) |
| } |
| |
| /// Build a regex from the given pattern using a specific representation |
| /// for the underlying DFA state IDs. |
| /// |
| /// If there was a problem parsing or compiling the pattern, then an error |
| /// is returned. |
| /// |
| /// The representation of state IDs is determined by the `S` type |
| /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` |
| /// or `usize`, where `usize` is the default used for `build`. The purpose |
| /// of specifying a representation for state IDs is to reduce the memory |
| /// footprint of the underlying DFAs. |
| /// |
| /// When using this routine, the chosen state ID representation will be |
| /// used throughout determinization and minimization, if minimization was |
| /// requested. Even if the minimized DFAs would fit into the chosen state |
| /// ID representation, an error is still returned if the initial |
| /// determinized DFA cannot fit. To get a minimized DFA with a smaller |
| /// state ID representation, first build it with a bigger state ID |
| /// representation, and then shrink the sizes of the DFAs using one of |
| /// its conversion routines, such as |
| /// [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). Finally, |
| /// reconstitute the regex via |
| /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas). |
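| /// |
| /// # Example |
| /// |
| /// An illustrative sketch of building a regex whose DFAs use `u16` state |
| /// IDs. This assumes the pattern is small enough for its (premultiplied) |
| /// state IDs to fit into `u16`; a pattern that is too big would cause an |
| /// error to be returned instead. |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// // The resulting type is `Regex<DenseDFA<Vec<u16>, u16>>`. |
| /// let re = RegexBuilder::new().build_with_size::<u16>("foo[0-9]+")?; |
| /// assert_eq!(Some((3, 9)), re.find(b"zzzfoo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |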
| pub fn build_with_size<S: StateID>( |
| &self, |
| pattern: &str, |
| ) -> Result<Regex<DenseDFA<Vec<S>, S>>> { |
| let forward = self.dfa.build_with_size(pattern)?; |
| let reverse = self |
| .dfa |
| .clone() |
| .anchored(true) |
| .reverse(true) |
| .longest_match(true) |
| .build_with_size(pattern)?; |
| Ok(Regex::from_dfas(forward, reverse)) |
| } |
| |
| /// Build a regex from the given pattern using sparse DFAs and a specific |
| /// representation for the underlying DFA state IDs. |
| pub fn build_with_size_sparse<S: StateID>( |
| &self, |
| pattern: &str, |
| ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> { |
| let re = self.build_with_size(pattern)?; |
| let fwd = re.forward().to_sparse()?; |
| let rev = re.reverse().to_sparse()?; |
| Ok(Regex::from_dfas(fwd, rev)) |
| } |
| |
| /// Set whether matching must be anchored at the beginning of the input. |
| /// |
| /// When enabled, a match must begin at the start of the input. When |
| /// disabled, the regex will act as if the pattern started with a `.*?`, |
| /// which enables a match to appear anywhere. |
| /// |
| /// By default this is disabled. |
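| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern and haystacks are arbitrary): |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.anchored(true); |
| /// let re = builder.build("foo[0-9]+")?; |
| /// // The match must begin at the start of the input... |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// // ...so a match that would otherwise begin at offset 3 is not found. |
| /// assert_eq!(false, re.is_match(b"zzzfoo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |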
| pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.anchored(yes); |
| self |
| } |
| |
| /// Enable or disable the case insensitive flag by default. |
| /// |
| /// By default this is disabled. It may alternatively be selectively |
| /// enabled in the regular expression itself via the `i` flag. |
| pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.case_insensitive(yes); |
| self |
| } |
| |
| /// Enable verbose mode in the regular expression. |
| /// |
| /// When enabled, verbose mode permits insignificant whitespace in many |
| /// places in the regular expression, as well as comments. Comments are |
| /// started using `#` and continue until the end of the line. |
| /// |
| /// By default, this is disabled. It may be selectively enabled in the |
| /// regular expression by using the `x` flag regardless of this setting. |
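| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern is arbitrary): |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.ignore_whitespace(true); |
| /// // The whitespace is ignored and the trailing comment is stripped, so |
| /// // this is equivalent to the pattern `foo[0-9]+`. |
| /// let re = builder.build("foo [0-9]+  # digits after foo")?; |
| /// assert_eq!(true, re.is_match(b"foo123")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |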
| pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.ignore_whitespace(yes); |
| self |
| } |
| |
| /// Enable or disable the "dot matches any character" flag by default. |
| /// |
| /// By default this is disabled. It may alternatively be selectively |
| /// enabled in the regular expression itself via the `s` flag. |
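| /// |
| /// # Example |
| /// |
| /// An illustrative sketch (the pattern is arbitrary): |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.dot_matches_new_line(true); |
| /// let re = builder.build("foo.bar")?; |
| /// // With this option enabled, `.` also matches a line terminator. |
| /// assert_eq!(true, re.is_match(b"foo\nbar")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |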
| pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.dot_matches_new_line(yes); |
| self |
| } |
| |
| /// Enable or disable the "swap greed" flag by default. |
| /// |
| /// By default this is disabled. It may alternatively be selectively |
| /// enabled in the regular expression itself via the `U` flag. |
| pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.swap_greed(yes); |
| self |
| } |
| |
| /// Enable or disable the Unicode flag (`u`) by default. |
| /// |
| /// By default this is **enabled**. It may alternatively be selectively |
| /// disabled in the regular expression itself via the `u` flag. |
| /// |
| /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by |
| /// default), a regular expression will fail to parse if Unicode mode is |
| /// disabled and a sub-expression could possibly match invalid UTF-8. |
| pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.unicode(yes); |
| self |
| } |
| |
| /// When enabled, the builder will permit the construction of a regular |
| /// expression that may match invalid UTF-8. |
| /// |
| /// When disabled (the default), the builder is guaranteed to produce a |
| /// regex that will only ever match valid UTF-8 (otherwise, the builder |
| /// will return an error). |
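| /// |
| /// # Example |
| /// |
| /// An illustrative sketch. The pattern below uses a Unicode-disabled |
| /// negated class, which can match arbitrary bytes, so it is assumed to be |
| /// rejected unless this option is enabled: |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// // By default, a pattern that can match invalid UTF-8 fails to build. |
| /// assert!(RegexBuilder::new().build(r"(?-u:[^a])").is_err()); |
| /// |
| /// // Enabling this option permits it, and the resulting regex can then |
| /// // match arbitrary bytes. |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.allow_invalid_utf8(true); |
| /// let re = builder.build(r"(?-u:[^a])")?; |
| /// assert_eq!(true, re.is_match(b"\xFF")); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |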
| pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.allow_invalid_utf8(yes); |
| self |
| } |
| |
| /// Set the nesting limit used for the regular expression parser. |
| /// |
| /// The nesting limit controls how deep the abstract syntax tree is allowed |
| /// to be. If the AST exceeds the given limit (e.g., with too many nested |
| /// groups), then an error is returned by the parser. |
| /// |
| /// The purpose of this limit is to act as a heuristic to prevent stack |
| /// overflow when building a finite automaton from a regular expression's |
| /// abstract syntax tree. In particular, construction currently uses |
| /// recursion. In the future, the implementation may stop using recursion |
| /// and this option will no longer be necessary. |
| /// |
| /// This limit is not checked until the entire AST is parsed. Therefore, |
| /// if callers want to put a limit on the amount of heap space used, then |
| /// they should impose a limit on the length, in bytes, of the concrete |
| /// pattern string. In particular, this is viable since the parser will |
| /// limit itself to heap space proportional to the length of the pattern |
| /// string. |
| /// |
| /// Note that a nest limit of `0` will return a nest limit error for most |
| /// patterns but not all. For example, a nest limit of `0` permits `a` but |
| /// not `ab`, since `ab` requires a concatenation AST item, which results |
| /// in a nest depth of `1`. In general, a nest limit is not something that |
| /// manifests in an obvious way in the concrete syntax; therefore, it |
| /// should not be used in a granular way. |
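| /// |
| /// # Example |
| /// |
| /// An illustrative sketch of the behavior described above: |
| /// |
| /// ``` |
| /// use regex_automata::RegexBuilder; |
| /// |
| /// # fn example() -> Result<(), regex_automata::Error> { |
| /// let mut builder = RegexBuilder::new(); |
| /// builder.nest_limit(0); |
| /// // `a` has no nesting at all, so it parses fine... |
| /// assert!(builder.build("a").is_ok()); |
| /// // ...but `ab` needs a concatenation AST item, which has depth `1`. |
| /// assert!(builder.build("ab").is_err()); |
| /// # Ok(()) }; example().unwrap() |
| /// ``` |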
| pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { |
| self.dfa.nest_limit(limit); |
| self |
| } |
| |
| /// Minimize the underlying DFAs. |
| /// |
| /// When enabled, the DFAs powering the resulting regex will be minimized |
| /// such that they are as small as possible. |
| /// |
| /// Whether one enables minimization or not depends on the types of costs |
| /// you're willing to pay and how much you care about its benefits. In |
| /// particular, minimization has worst case `O(n*k*log n)` time and `O(k*n)` |
| /// space, where `n` is the number of DFA states and `k` is the alphabet |
| /// size. In practice, minimization can be quite costly in terms of both |
| /// space and time, so it should only be done if you're willing to wait |
| /// longer to produce a DFA. In general, you might want a minimal DFA in |
| /// the following circumstances: |
| /// |
| /// 1. You would like to optimize for the size of the automaton. This can |
| /// manifest in one of two ways. Firstly, if you're converting the |
| /// DFA into Rust code (or a table embedded in the code), then a minimal |
| /// DFA will translate into a corresponding reduction in code size, and |
| /// thus, also the final compiled binary size. Secondly, if you are |
| /// building many DFAs and putting them on the heap, you'll be able to |
| /// fit more if they are smaller. Note though that building a minimal |
| /// DFA itself requires additional space; you only realize the space |
| /// savings once the minimal DFA is constructed (at which point, the |
| /// space used for minimization is freed). |
| /// 2. You've observed that a smaller DFA results in faster match |
| /// performance. Naively, this isn't guaranteed since there is no |
| /// inherent difference between matching with a bigger-than-minimal |
| /// DFA and a minimal DFA. However, a smaller DFA may make use of your |
| /// CPU's cache more efficiently. |
| /// 3. You are trying to establish an equivalence between regular |
| /// languages. The standard method for this is to build a minimal DFA |
| /// for each language and then compare them. If the DFAs are equivalent |
| /// (up to state renaming), then the languages are equivalent. |
| /// |
| /// This option is disabled by default. |
| pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.minimize(yes); |
| self |
| } |
| |
| /// Premultiply state identifiers in the underlying DFA transition tables. |
| /// |
| /// When enabled, state identifiers are premultiplied to point to their |
| /// corresponding row in the DFA's transition table. That is, given the |
| /// `i`th state, its corresponding premultiplied identifier is `i * k` |
| /// where `k` is the alphabet size of the DFA. (The alphabet size is at |
| /// most 256, but is in practice smaller if byte classes are enabled.) |
| /// |
| /// When state identifiers are not premultiplied, then the identifier of |
| /// the `i`th state is `i`. |
| /// |
| /// The advantage of premultiplying state identifiers is that it saves |
| /// a multiplication instruction per byte when searching with the DFA. |
| /// This has been observed to lead to a 20% performance benefit in |
| /// micro-benchmarks. |
| /// |
| /// The primary disadvantage of premultiplying state identifiers is |
| /// that they require a larger integer size to represent. For example, |
| /// if your DFA has 200 states, then its premultiplied form requires |
| /// 16 bits to represent every possible state identifier, whereas its |
| /// non-premultiplied form only requires 8 bits. |
| /// |
| /// This option is enabled by default. |
| pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.premultiply(yes); |
| self |
| } |
| |
| /// Shrink the size of the underlying DFA alphabet by mapping bytes to |
| /// their equivalence classes. |
| /// |
| /// When enabled, each DFA will use a map from all possible bytes to their |
| /// corresponding equivalence class. Each equivalence class represents a |
| /// set of bytes that does not discriminate between a match and a non-match |
| /// in the DFA. For example, the pattern `[ab]+` has at least two |
| /// equivalence classes: a set containing `a` and `b` and a set containing |
| /// every byte except for `a` and `b`. `a` and `b` are in the same |
| /// equivalence class because they never discriminate between a match |
| /// and a non-match. |
| /// |
| /// The advantage of this map is that the size of the transition table can |
| /// be reduced drastically from `#states * 256 * sizeof(id)` to |
| /// `#states * k * sizeof(id)` where `k` is the number of equivalence |
| /// classes. As a result, total space usage can decrease substantially. |
| /// Moreover, since a smaller alphabet is used, compilation becomes faster |
| /// as well. |
| /// |
| /// The disadvantage of this map is that every byte searched must be |
| /// passed through this map before it can be used to determine the next |
| /// transition. This has a small match time performance cost. |
| /// |
| /// This option is enabled by default. |
| pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.byte_classes(yes); |
| self |
| } |
| |
| /// Apply best effort heuristics to shrink the NFA at the expense of more |
| /// time/memory. |
| /// |
| /// This may be exposed in the future, but for now is exported for use in |
| /// the `regex-automata-debug` tool. |
| #[doc(hidden)] |
| pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder { |
| self.dfa.shrink(yes); |
| self |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| impl Default for RegexBuilder { |
| fn default() -> RegexBuilder { |
| RegexBuilder::new() |
| } |
| } |