android/vendor/regex-1.10.4/src/regexset/bytes.rs - toolchain/cargo-deny - Git at Google

 use alloc::string::String;

 use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};

 use crate::{bytes::RegexSetBuilder, Error};

 /// Match multiple, possibly overlapping, regexes in a single search.
 ///
 /// A regex set corresponds to the union of zero or more regular expressions.
 /// That is, a regex set will match a haystack when at least one of its
 /// constituent regexes matches. A regex set as it's formulated here provides a
 /// touch more power: it will also report *which* regular expressions in the
 /// set match. Indeed, this is the key difference between regex sets and a
 /// single `Regex` with many alternates, since only one alternate can match at
 /// a time.
 ///
 /// For example, consider regular expressions to match email addresses and
 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
 /// regex set is constructed from those regexes, then searching the haystack
 /// `foo@example.com` will report both regexes as matching. Of course, one
 /// could accomplish this by compiling each regex on its own and doing two
 /// searches over the haystack. The key advantage of using a regex set is
 /// that it will report the matching regexes using a *single pass through the
 /// haystack*. If one has hundreds or thousands of regexes to match repeatedly
 /// (like a URL router for a complex web application or a user agent matcher),
 /// then a regex set *can* realize huge performance gains.
 ///
 /// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet`
 /// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this
 /// `RegexSet` is permitted to match invalid UTF-8.
 ///
 /// # Limitations
 ///
 /// Regex sets are limited to answering the following two questions:
 ///
 /// 1. Does any regex in the set match?
 /// 2. If so, which regexes in the set match?
 ///
 /// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask
 /// (1) instead of (2) since the matching engines can stop after the first
 /// match is found.
 ///
 /// You cannot directly extract [`Match`][crate::bytes::Match] or
 /// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need
 /// these operations, the recommended approach is to compile each pattern in
 /// the set independently and scan the exact same haystack a second time with
 /// those independently compiled patterns:
 ///
 /// ```
 /// use regex::bytes::{Regex, RegexSet};
 ///
 /// let patterns = ["foo", "bar"];
 /// // Both patterns will match different ranges of this string.
 /// let hay = b"barfoo";
 ///
 /// // Compile a set matching any of our patterns.
 /// let set = RegexSet::new(patterns).unwrap();
 /// // Compile each pattern independently.
 /// let regexes: Vec<_> = set
 ///     .patterns()
 ///     .iter()
 ///     .map(|pat| Regex::new(pat).unwrap())
 ///     .collect();
 ///
 /// // Match against the whole set first and identify the individual
 /// // matching patterns.
 /// let matches: Vec<&[u8]> = set
 ///     .matches(hay)
 ///     .into_iter()
 ///     // Dereference the match index to get the corresponding
 ///     // compiled pattern.
 ///     .map(|index| &regexes[index])
 ///     // To get match locations or any other info, we then have to search the
 ///     // exact same haystack again, using our separately-compiled pattern.
 ///     .map(|re| re.find(hay).unwrap().as_bytes())
 ///     .collect();
 ///
 /// // Matches arrive in the order the constituent patterns were declared,
 /// // not the order they appear in the haystack.
 /// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches);
 /// ```
 ///
 /// # Performance
 ///
 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
 /// search takes `O(m * n)` time, where `m` is proportional to the size of the
 /// regex set and `n` is proportional to the length of the haystack.
 ///
 /// # Trait implementations
 ///
 /// The `Default` trait is implemented for `RegexSet`. The default value
 /// is an empty set. An empty set can also be explicitly constructed via
 /// [`RegexSet::empty`].
 ///
 /// # Example
 ///
 /// This shows how the above two regexes (for matching email addresses and
 /// domains) might work:
 ///
 /// ```
 /// use regex::bytes::RegexSet;
 ///
 /// let set = RegexSet::new(&[
 ///     r"[a-z]+@[a-z]+\.(com|org|net)",
 ///     r"[a-z]+\.(com|org|net)",
 /// ]).unwrap();
 ///
 /// // Ask whether any regexes in the set match.
 /// assert!(set.is_match(b"foo@example.com"));
 ///
 /// // Identify which regexes in the set match.
 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
 /// assert_eq!(vec![0, 1], matches);
 ///
 /// // Try again, but with a haystack that only matches one of the regexes.
 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
 /// assert_eq!(vec![1], matches);
 ///
 /// // Try again, but with a haystack that doesn't match any regex in the set.
 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
 /// assert!(matches.is_empty());
 /// ```
 ///
 /// Note that it would be possible to adapt the above example to using `Regex`
 /// with an expression like:
 ///
 /// ```text
 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
 /// ```
 ///
 /// After a match, one could then inspect the capture groups to figure out
 /// which alternates matched. The problem is that it is hard to make this
 /// approach scale when there are many regexes since the overlap between each
 /// alternate isn't always obvious to reason about.
 #[derive(Clone)]
 pub struct RegexSet {
     // The compiled matcher for the whole set. Every search method and the
     // pattern count (`len`, `is_empty`) are answered by it.
     pub(crate) meta: meta::Regex,
     // The pattern strings handed back by `RegexSet::patterns`.
     pub(crate) patterns: alloc::sync::Arc<[String]>,
 }

 impl RegexSet {
     /// Create a new regex set with the given regular expressions.
     ///
     /// This takes an iterator of `S`, where `S` is something that can produce
     /// a `&str`. If any of the strings in the iterator are not valid regular
     /// expressions, then an error is returned.
     ///
     /// # Example
     ///
     /// Create a new regex set from an iterator of strings:
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
     /// assert!(set.is_match(b"foo"));
     /// ```
     pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
     where
         S: AsRef<str>,
         I: IntoIterator<Item = S>,
     {
         // Construction is delegated to the builder without touching any of
         // its options; whatever `build` reports is returned as-is.
         let builder = RegexSetBuilder::new(exprs);
         builder.build()
     }

     /// Create a new empty regex set.
     ///
     /// An empty regex set never matches anything.
     ///
     /// This is a convenience function for `RegexSet::new([])`, but doesn't
     /// require one to specify the type of the input.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::empty();
     /// assert!(set.is_empty());
     /// // an empty set matches nothing
     /// assert!(!set.is_match(b""));
     /// ```
     pub fn empty() -> RegexSet {
         // The element type has to be spelled out because an empty array
         // literal gives the compiler nothing to infer it from.
         let no_patterns: [&str; 0] = [];
         let built = RegexSetBuilder::new(no_patterns).build();
         built.unwrap()
     }

     /// Returns true if and only if one of the regexes in this set matches
     /// the haystack given.
     ///
     /// This method should be preferred if you only need to test whether any
     /// of the regexes in the set should match, but don't care about *which*
     /// regexes matched. This is because the underlying matching engine will
     /// quit immediately after seeing the first match instead of continuing to
     /// find all matches.
     ///
     /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
     /// expression is unanchored by default. That is, if the regex does not
     /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
     /// to match anywhere in the haystack.
     ///
     /// # Example
     ///
     /// Tests whether a set matches somewhere in a haystack:
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
     /// assert!(set.is_match(b"foo"));
     /// assert!(!set.is_match("☃".as_bytes()));
     /// ```
     #[inline]
     pub fn is_match(&self, haystack: &[u8]) -> bool {
         // A whole-haystack search is just an offset search starting at 0.
         let start = 0;
         self.is_match_at(haystack, start)
     }

     /// Returns true if and only if one of the regexes in this set matches the
     /// haystack given, with the search starting at the offset given.
     ///
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
     ///
     /// # Panics
     ///
     /// This panics when `start >= haystack.len() + 1`.
     ///
     /// # Example
     ///
     /// This example shows the significance of `start`. Namely, consider a
     /// haystack `foobar` and a desire to execute a search starting at offset
     /// `3`. You could search a substring explicitly, but then the look-around
     /// assertions won't work correctly. Instead, you can use this method to
     /// specify the start position of a search.
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
     /// let hay = b"foobar";
     /// // We get a match here, but it's probably not intended.
     /// assert!(set.is_match(&hay[3..]));
     /// // No match because the assertions take the context into account.
     /// assert!(!set.is_match_at(hay, 3));
     /// ```
     #[inline]
     pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
         // Hand the matcher the full haystack with the search span limited to
         // `start..`, instead of a subslice, so the bytes before `start`
         // remain available as context.
         let search = Input::new(haystack).span(start..haystack.len());
         self.meta.is_match(search)
     }

     /// Returns the set of regexes that match in the given haystack.
     ///
     /// The set returned contains the index of each regex that matches in
     /// the given haystack. The index is in correspondence with the order of
     /// regular expressions given to `RegexSet`'s constructor.
     ///
     /// The set can also be used to iterate over the matched indices. The order
     /// of iteration is always ascending with respect to the matching indices.
     ///
     /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
     /// expression is unanchored by default. That is, if the regex does not
     /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
     /// to match anywhere in the haystack.
     ///
     /// # Example
     ///
     /// Tests which regular expressions match the given haystack:
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([
     ///     r"\w+",
     ///     r"\d+",
     ///     r"\pL+",
     ///     r"foo",
     ///     r"bar",
     ///     r"barfoo",
     ///     r"foobar",
     /// ]).unwrap();
     /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect();
     /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
     ///
     /// // You can also test whether a particular regex matched:
     /// let matches = set.matches(b"foobar");
     /// assert!(!matches.matched(5));
     /// assert!(matches.matched(6));
     /// ```
     #[inline]
     pub fn matches(&self, haystack: &[u8]) -> SetMatches {
         // Equivalent to an offset search that begins at the first byte.
         let start = 0;
         self.matches_at(haystack, start)
     }

     /// Returns the set of regexes that match in the given haystack.
     ///
     /// The set returned contains the index of each regex that matches in
     /// the given haystack. The index is in correspondence with the order of
     /// regular expressions given to `RegexSet`'s constructor.
     ///
     /// The set can also be used to iterate over the matched indices. The order
     /// of iteration is always ascending with respect to the matching indices.
     ///
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
     ///
     /// # Panics
     ///
     /// This panics when `start >= haystack.len() + 1`.
     ///
     /// # Example
     ///
     /// Tests which regular expressions match the given haystack:
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
     /// let hay = b"foobar";
     /// // We get matches here, but it's probably not intended.
     /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();
     /// assert_eq!(matches, vec![0, 1]);
     /// // No matches because the assertions take the context into account.
     /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
     /// assert_eq!(matches, vec![]);
     /// ```
     #[inline]
     pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches {
         // Keep the whole haystack and restrict only the search span, so the
         // bytes before `start` remain available as context.
         let search = Input::new(haystack).span(start..haystack.len());
         // One slot per pattern in the set; the search fills in the matches.
         let mut found = PatternSet::new(self.meta.pattern_len());
         self.meta.which_overlapping_matches(&search, &mut found);
         SetMatches(found)
     }

     /// Returns the same as matches, but starts the search at the given
     /// offset and stores the matches into the slice given.
     ///
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
     ///
     /// `matches` must have a length that is at least the number of regexes
     /// in this set.
     ///
     /// This method returns true if and only if at least one member of
     /// `matches` is true after executing the set against `haystack`.
     #[doc(hidden)]
     #[inline]
     pub fn matches_read_at(
         &self,
         matches: &mut [bool],
         haystack: &[u8],
         start: usize,
     ) -> bool {
         // This round trip through a temporary `PatternSet` is wasteful, but
         // the regex-automata API offers no way to write matches straight into
         // an arbitrary `&mut [bool]`. This method is doc(hidden) and so not
         // part of the public API, but regex-capi currently calls it. One
         // option would be to have regex-capi use a `PatternSet`; since that
         // type lives in regex-automata rather than regex, accepting a
         // `SetMatches` (essentially a newtype around `PatternSet`) may be the
         // better fix.
         let mut found = PatternSet::new(self.meta.pattern_len());
         let mut search = Input::new(haystack);
         search.set_start(start);
         self.meta.which_overlapping_matches(&search, &mut found);
         // Only the slots of matching patterns are written here; the other
         // entries of `matches` keep whatever value the caller passed in.
         found.iter().for_each(|pid| matches[pid] = true);
         !found.is_empty()
     }

     /// An alias for `matches_read_at` to preserve backward compatibility.
     ///
     /// The `regex-capi` crate used this method, so to avoid breaking that
     /// crate, we continue to export it as an undocumented API.
     #[doc(hidden)]
     #[inline]
     pub fn read_matches_at(
         &self,
         matches: &mut [bool],
         haystack: &[u8],
         start: usize,
     ) -> bool {
         self.matches_read_at(matches, haystack, start)
     }

     /// Returns the total number of regexes in this set.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// assert_eq!(0, RegexSet::empty().len());
     /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
     /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
     /// ```
     #[inline]
     pub fn len(&self) -> usize {
         self.meta.pattern_len()
     }

     /// Returns `true` if this set contains no regexes.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// assert!(RegexSet::empty().is_empty());
     /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
     /// ```
     #[inline]
     pub fn is_empty(&self) -> bool {
         self.meta.pattern_len() == 0
     }

     /// Returns the regex patterns that this regex set was constructed from.
     ///
     /// This function can be used to determine the pattern for a match. The
     /// slice returned has exactly as many patterns as were given to this regex set,
     /// and the order of the slice is the same as the order of the patterns
     /// provided to the set.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new(&[
     ///     r"\w+",
     ///     r"\d+",
     ///     r"\pL+",
     ///     r"foo",
     ///     r"bar",
     ///     r"barfoo",
     ///     r"foobar",
     /// ]).unwrap();
     /// let matches: Vec<_> = set
     ///     .matches(b"foobar")
     ///     .into_iter()
     ///     .map(|index| &set.patterns()[index])
     ///     .collect();
     /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
     /// ```
     #[inline]
     pub fn patterns(&self) -> &[String] {
         // Borrow the stored slice; no pattern strings are copied.
         &self.patterns[..]
     }
 }

 impl Default for RegexSet {
     fn default() -> Self {
         RegexSet::empty()
     }
 }

 /// A set of matches returned by a regex set.
 ///
 /// Values of this type are constructed by [`RegexSet::matches`].
 #[derive(Clone, Debug)]
 // Newtype over the `PatternSet` filled in by a set search; all methods below
 // are answered by querying that set.
 pub struct SetMatches(PatternSet);

 impl SetMatches {
     /// Whether this set contains any matches.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new(&[
     ///     r"[a-z]+@[a-z]+\.(com|org|net)",
     ///     r"[a-z]+\.(com|org|net)",
     /// ]).unwrap();
     /// let matches = set.matches(b"foo@example.com");
     /// assert!(matches.matched_any());
     /// ```
     #[inline]
     pub fn matched_any(&self) -> bool {
         !self.0.is_empty()
     }

     /// Whether the regex at the given index matched.
     ///
     /// The index for a regex is determined by its insertion order upon the
     /// initial construction of a `RegexSet`, starting at `0`.
     ///
     /// # Panics
     ///
     /// If `index` is greater than or equal to the number of regexes in the
     /// original set that produced these matches. Equivalently, when `index`
     /// is greater than or equal to [`SetMatches::len`].
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([
     ///     r"[a-z]+@[a-z]+\.(com|org|net)",
     ///     r"[a-z]+\.(com|org|net)",
     /// ]).unwrap();
     /// let matches = set.matches(b"example.com");
     /// assert!(!matches.matched(0));
     /// assert!(matches.matched(1));
     /// ```
     #[inline]
     pub fn matched(&self, index: usize) -> bool {
         // The index is turned into a pattern ID without validation; the
         // membership test itself is done by the underlying pattern set.
         let pid = PatternID::new_unchecked(index);
         self.0.contains(pid)
     }

     /// The total number of regexes in the set that created these matches.
     ///
     /// **WARNING:** This always returns the same value as [`RegexSet::len`].
     /// In particular, it does *not* return the number of elements yielded by
     /// [`SetMatches::iter`]. The only way to determine the total number of
     /// matched regexes is to iterate over them.
     ///
     /// # Example
     ///
     /// Notice that this method returns the total number of regexes in the
     /// original set, and *not* the total number of regexes that matched.
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([
     ///     r"[a-z]+@[a-z]+\.(com|org|net)",
     ///     r"[a-z]+\.(com|org|net)",
     /// ]).unwrap();
     /// let matches = set.matches(b"example.com");
     /// // Total number of patterns that matched.
     /// assert_eq!(1, matches.iter().count());
     /// // Total number of patterns in the set.
     /// assert_eq!(2, matches.len());
     /// ```
     #[inline]
     pub fn len(&self) -> usize {
         self.0.capacity()
     }

     /// Returns an iterator over the indices of the regexes that matched.
     ///
     /// This will always produce matches in ascending order, where the index
     /// yielded corresponds to the index of the regex that matched with respect
     /// to its position when initially building the set.
     ///
     /// # Example
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([
     ///     r"[0-9]",
     ///     r"[a-z]",
     ///     r"[A-Z]",
     ///     r"\p{Greek}",
     /// ]).unwrap();
     /// let hay = "βa1".as_bytes();
     /// let matches: Vec<_> = set.matches(hay).iter().collect();
     /// assert_eq!(matches, vec![0, 1, 3]);
     /// ```
     ///
     /// Note that `SetMatches` also implements the `IntoIterator` trait, so
     /// this method is not always needed. For example:
     ///
     /// ```
     /// use regex::bytes::RegexSet;
     ///
     /// let set = RegexSet::new([
     ///     r"[0-9]",
     ///     r"[a-z]",
     ///     r"[A-Z]",
     ///     r"\p{Greek}",
     /// ]).unwrap();
     /// let hay = "βa1".as_bytes();
     /// let mut matches = vec![];
     /// for index in set.matches(hay) {
     ///     matches.push(index);
     /// }
     /// assert_eq!(matches, vec![0, 1, 3]);
     /// ```
     #[inline]
     pub fn iter(&self) -> SetMatchesIter<'_> {
         SetMatchesIter(self.0.iter())
     }
 }

 impl IntoIterator for SetMatches {
     type IntoIter = SetMatchesIntoIter;
     type Item = usize;

     fn into_iter(self) -> Self::IntoIter {
         let it = 0..self.0.capacity();
         SetMatchesIntoIter { patset: self.0, it }
     }
 }

 impl<'a> IntoIterator for &'a SetMatches {
     type IntoIter = SetMatchesIter<'a>;
     type Item = usize;

     fn into_iter(self) -> Self::IntoIter {
         self.iter()
     }
 }

 /// An owned iterator over the set of matches from a regex set.
 ///
 /// This will always produce matches in ascending order of index, where the
 /// index corresponds to the index of the regex that matched with respect to
 /// its position when initially building the set.
 ///
 /// This iterator is created by calling `SetMatches::into_iter` via the
 /// `IntoIterator` trait. This is automatically done in `for` loops.
 ///
 /// # Example
 ///
 /// ```
 /// use regex::bytes::RegexSet;
 ///
 /// let set = RegexSet::new([
 ///     r"[0-9]",
 ///     r"[a-z]",
 ///     r"[A-Z]",
 ///     r"\p{Greek}",
 /// ]).unwrap();
 /// let hay = "βa1".as_bytes();
 /// let mut matches = vec![];
 /// for index in set.matches(hay) {
 ///     matches.push(index);
 /// }
 /// assert_eq!(matches, vec![0, 1, 3]);
 /// ```
 #[derive(Debug)]
 pub struct SetMatchesIntoIter {
     // The match set being consumed; consulted to decide whether a candidate
     // index is yielded.
     patset: PatternSet,
     // Candidate pattern indices not yet examined from either end.
     it: core::ops::Range<usize>,
 }

 impl Iterator for SetMatchesIntoIter {
     type Item = usize;

     /// Returns the next matching pattern index in ascending order, or `None`
     /// once every remaining candidate has been examined.
     fn next(&mut self) -> Option<usize> {
         let patset = &self.patset;
         self.it.find(|&id| patset.contains(PatternID::new_unchecked(id)))
     }

     /// Bounds the number of indices still to be yielded.
     ///
     /// `it` ranges over *candidate* indices and `next` discards the ones
     /// that are not in the match set, so the remaining range length is only
     /// an upper bound. The number of matches left is not known without
     /// scanning, so the lower bound is zero.
     fn size_hint(&self) -> (usize, Option<usize>) {
         let (_, upper) = self.it.size_hint();
         (0, upper)
     }
 }

 impl DoubleEndedIterator for SetMatchesIntoIter {
     /// Walks the candidate indices from the high end and returns the first
     /// one found in the match set, or `None` when the candidates run out.
     fn next_back(&mut self) -> Option<usize> {
         let patset = &self.patset;
         self.it.rfind(|&id| patset.contains(PatternID::new_unchecked(id)))
     }
 }

 // Marker impl: `next` returns `None` only when the inner `Range` does, and
 // an exhausted `Range` keeps returning `None`.
 impl core::iter::FusedIterator for SetMatchesIntoIter {}

 /// A borrowed iterator over the set of matches from a regex set.
 ///
 /// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
 /// created this iterator.
 ///
 /// This will always produce matches in ascending order, where the index
 /// corresponds to the index of the regex that matched with respect to its
 /// position when initially building the set.
 ///
 /// This iterator is created by the [`SetMatches::iter`] method.
 #[derive(Clone, Debug)]
 // Thin wrapper over the pattern-set iterator; the `Iterator` impls convert
 // each yielded pattern ID to a `usize`.
 pub struct SetMatchesIter<'a>(PatternSetIter<'a>);

 impl<'a> Iterator for SetMatchesIter<'a> {
     type Item = usize;

     fn next(&mut self) -> Option<usize> {
         self.0.next().map(|pid| pid.as_usize())
     }

     fn size_hint(&self) -> (usize, Option<usize>) {
         self.0.size_hint()
     }
 }

 impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
     /// Takes the next pattern ID from the back of the wrapped iterator and
     /// converts it into a plain index.
     fn next_back(&mut self) -> Option<usize> {
         let pid = self.0.next_back()?;
         Some(pid.as_usize())
     }
 }

 // Marker impl. NOTE(review): this relies on the wrapped `PatternSetIter`
 // continuing to return `None` once exhausted; confirm against
 // regex-automata.
 impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}

 impl core::fmt::Debug for RegexSet {
     /// Formats the set as `RegexSet(..)` wrapping the debug form of its
     /// pattern strings; the compiled matcher is not shown.
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         let patterns = self.patterns();
         f.write_fmt(format_args!("RegexSet({:?})", patterns))
     }
 }