// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::mem;

use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
use memchr::{memchr, memchr2, memchr3};
use syntax;

use freqs::BYTE_FREQUENCIES;

use simd_accel::teddy128::{Teddy, is_teddy_128_available};

/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings, at least one of which must
/// occur at the start of any match of the corresponding regex.
///
/// There are a variety of ways to efficiently scan the search text for a
/// prefix. Currently, three are implemented:
///
/// 1. The prefix is a single byte. Just use memchr.
/// 2. If the prefix is a set of two or more single byte prefixes, then
///    a single sparse map is created. Checking for a match is a lookup in
///    this map for each byte in the search text.
/// 3. In all other cases, build an Aho-Corasick automaton.
///
/// It's possible that there's room here for other substring algorithms,
/// such as Boyer-Moore for single prefixes longer than one byte, or
/// Rabin-Karp for small sets of same-length prefixes.
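///
/// # Example
///
/// A hedged sketch of how the matching engines use this type.
/// `union_prefixes` is assumed to be the `regex-syntax` method that
/// accumulates prefix literals from a parsed expression; everything else
/// is this module's own API:
///
/// ```ignore
/// // `expr` is a parsed `syntax::Expr`.
/// let mut lits = syntax::Literals::empty();
/// lits.union_prefixes(&expr);
/// let searcher = LiteralSearcher::prefixes(lits);
/// // Scan for the leftmost occurrence of any prefix literal.
/// if let Some((s, e)) = searcher.find(b"the haystack") {
///     // Some prefix literal occupies bytes s..e of the haystack.
/// }
/// ```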
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    complete: bool,
    lcp: SingleSearch,
    lcs: SingleSearch,
    matcher: Matcher,
}

#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of single byte literals, found with memchr when the set is
    /// small and with a sparse lookup table otherwise.
    Bytes(SingleByteSet),
    /// A single substring. (Likely using Boyer-Moore with memchr.)
    Single(SingleSearch),
    /// An Aho-Corasick automaton.
    AC(FullAcAutomaton<syntax::Lit>),
    /// A SIMD-accelerated matcher for multiple substrings.
    Teddy128(Teddy),
}

impl LiteralSearcher {
    /// Returns a matcher with no literals. It reports a zero-width match
    /// at offset 0 and never advances through the input.
    pub fn empty() -> Self {
        Self::new(syntax::Literals::empty(), Matcher::Empty)
    }

    /// Returns a matcher for literal prefixes from the given set.
    pub fn prefixes(lits: syntax::Literals) -> Self {
        let matcher = Matcher::prefixes(&lits);
        Self::new(lits, matcher)
    }

    /// Returns a matcher for literal suffixes from the given set.
    pub fn suffixes(lits: syntax::Literals) -> Self {
        let matcher = Matcher::suffixes(&lits);
        Self::new(lits, matcher)
    }

    fn new(lits: syntax::Literals, matcher: Matcher) -> Self {
        let complete = lits.all_complete();
        LiteralSearcher {
            complete: complete,
            lcp: SingleSearch::new(lits.longest_common_prefix().to_vec()),
            lcs: SingleSearch::new(lits.longest_common_suffix().to_vec()),
            matcher: matcher,
        }
    }

    /// Returns true if all matches comprise the entire regular expression.
    ///
    /// This does not necessarily mean that a literal match implies a match
    /// of the regular expression. For example, the regular expression `^a`
    /// consists of a single complete literal `a`, but the regular
    /// expression demands that it only match at the beginning of a string.
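    ///
    /// A hedged sketch of the intended use. The `searcher`, `haystack`, and
    /// `anchored` flag are assumed surrounding engine state, not part of
    /// this module:
    ///
    /// ```ignore
    /// // For an unanchored regex whose literals are complete, a literal hit
    /// // doubles as a regex match. An anchored regex such as `^a` would
    /// // additionally require the hit to start at offset 0.
    /// if searcher.complete() && !anchored {
    ///     if let Some((s, e)) = searcher.find(haystack) {
    ///         return Some((s, e));
    ///     }
    /// }
    /// ```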
    pub fn complete(&self) -> bool {
        self.complete && self.len() > 0
    }

    /// Find the position of a literal in `haystack` if it exists.
    #[inline(always)] // reduces constant overhead
    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        use self::Matcher::*;
        match self.matcher {
            Empty => Some((0, 0)),
            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
            Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
            Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)),
        }
    }

    /// Like find, except matches must start at index `0`.
    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[0..lit.len()] {
                return Some((0, lit.len()));
            }
        }
        None
    }

    /// Like find, except matches must end at index `haystack.len()`.
    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[haystack.len() - lit.len()..] {
                return Some((haystack.len() - lit.len(), haystack.len()));
            }
        }
        None
    }

    /// Returns an iterator over all literals to be matched.
    pub fn iter(&self) -> LiteralIter {
        match self.matcher {
            Matcher::Empty => LiteralIter::Empty,
            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
            Matcher::Single(ref s) => LiteralIter::Single(&s.pat),
            Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
            Matcher::Teddy128(ref ted) => {
                LiteralIter::Teddy128(ted.patterns())
            }
        }
    }

    /// Returns a matcher for the longest common prefix of this matcher.
    pub fn lcp(&self) -> &SingleSearch {
        &self.lcp
    }

    /// Returns a matcher for the longest common suffix of this matcher.
    pub fn lcs(&self) -> &SingleSearch {
        &self.lcs
    }

    /// Returns true iff this matcher contains no literals.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of literals in this matcher.
    pub fn len(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.dense.len(),
            Single(_) => 1,
            AC(ref aut) => aut.len(),
            Teddy128(ref ted) => ted.len(),
        }
    }

    /// Return the approximate heap usage of literals in bytes.
    pub fn approximate_size(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.approximate_size(),
            Single(ref single) => single.approximate_size(),
            AC(ref aut) => aut.heap_bytes(),
            Teddy128(ref ted) => ted.approximate_size(),
        }
    }
}

impl Matcher {
    fn prefixes(lits: &syntax::Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    fn suffixes(lits: &syntax::Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members are all rare in the
            // haystack, then it'd be worth using. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            let lit = lits.literals()[0].to_vec();
            return Matcher::Single(SingleSearch::new(lit));
        }
        let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii;
        if is_teddy_128_available() && !is_aho_corasick_fast {
            // Only try Teddy if Aho-Corasick can't use memchr on an ASCII
            // byte. Also, in its current form, Teddy doesn't scale well to
            // lots of literals.
            //
            // We impose the ASCII restriction because non-ASCII string
            // literals from the same language are likely to all start with
            // the same byte. Even worse, the corpus being searched probably
            // has a similar composition, which ends up completely negating
            // the benefit of memchr.
            const MAX_TEDDY_LITERALS: usize = 32;
            if lits.literals().len() <= MAX_TEDDY_LITERALS {
                if let Some(ted) = Teddy::new(lits) {
                    return Matcher::Teddy128(ted);
                }
            }
            // Fall through to ol' reliable Aho-Corasick...
        }
        let pats = lits.literals().to_owned();
        Matcher::AC(AcAutomaton::new(pats).into_full())
    }
}

/// An iterator over the literals in a matcher, yielding each literal as a
/// byte slice.
pub enum LiteralIter<'a> {
    Empty,
    Bytes(&'a [u8]),
    Single(&'a [u8]),
    AC(&'a [syntax::Lit]),
    Teddy128(&'a [Vec<u8>]),
}

impl<'a> Iterator for LiteralIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            LiteralIter::Empty => None,
            LiteralIter::Bytes(ref mut many) => {
                if many.is_empty() {
                    None
                } else {
                    let next = &many[0..1];
                    *many = &many[1..];
                    Some(next)
                }
            }
            LiteralIter::Single(ref mut one) => {
                if one.is_empty() {
                    None
                } else {
                    let next = &one[..];
                    *one = &[];
                    Some(next)
                }
            }
            LiteralIter::AC(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
            LiteralIter::Teddy128(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
        }
    }
}

#[derive(Clone, Debug)]
struct SingleByteSet {
    /// A map from byte value to whether the byte is in the set.
    sparse: Vec<bool>,
    /// The members of the set, in insertion order.
    dense: Vec<u8>,
    /// True if every literal in the set is exactly one byte, so that
    /// finding any member byte constitutes a complete literal match.
    complete: bool,
    /// True if every byte in the set is ASCII.
    all_ascii: bool,
}

impl SingleByteSet {
    fn new() -> SingleByteSet {
        SingleByteSet {
            sparse: vec![false; 256],
            dense: vec![],
            complete: true,
            all_ascii: true,
        }
    }

    fn prefixes(lits: &syntax::Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(0) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    fn suffixes(lits: &syntax::Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            // Use `unwrap_or(0)` so that an empty literal indexes slot 0
            // and `get` returns `None` instead of panicking.
            if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap_or(0)) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    /// Faster find that special cases certain sizes to use memchr.
    #[inline(always)] // reduces constant overhead
    fn find(&self, text: &[u8]) -> Option<usize> {
        match self.dense.len() {
            0 => None,
            1 => memchr(self.dense[0], text),
            2 => memchr2(self.dense[0], self.dense[1], text),
            3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
            _ => self._find(text),
        }
    }

    /// Generic find that works on any sized set.
    fn _find(&self, haystack: &[u8]) -> Option<usize> {
        for (i, &b) in haystack.iter().enumerate() {
            if self.sparse[b as usize] {
                return Some(i);
            }
        }
        None
    }

    fn approximate_size(&self) -> usize {
        (self.dense.len() * mem::size_of::<u8>())
            + (self.sparse.len() * mem::size_of::<bool>())
    }
}

/// Provides an implementation of fast substring search.
///
/// This particular implementation is a Boyer-Moore variant, based on the
/// "Tuned Boyer-Moore" search of Hume & Sunday (1991). It has been tweaked
/// slightly to better use memchr.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
///
/// TODO(burntsushi): Add some amount of shifting to this.
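///
/// # Example
///
/// A sketch of the strategy, with made-up frequency ranks (the real ranks
/// live in src/freqs.rs): to search for `"zebra"`, suppose `z` is the
/// rarest byte and `b` the second rarest. memchr jumps between occurrences
/// of `z`; each hit anchors a candidate window, which is rejected cheaply
/// if its `b` position does not hold `b`, and otherwise compared in full:
///
/// ```ignore
/// let searcher = SingleSearch::new(b"zebra".to_vec());
/// assert_eq!(searcher.find(b"a striped zebra"), Some(10));
/// assert_eq!(searcher.find(b"no zed here"), None);
/// ```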
#[derive(Clone, Debug)]
pub struct SingleSearch {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}

impl SingleSearch {
    fn new(pat: Vec<u8>) -> SingleSearch {
        fn freq_rank(b: u8) -> usize { BYTE_FREQUENCIES[b as usize] as usize }

        if pat.is_empty() {
            return SingleSearch::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        // Pick rare2 as the rarest byte distinct from rare1 when possible;
        // if the pattern has only one distinct byte, rare2 stays equal to
        // rare1.
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b;
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        SingleSearch {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    fn empty() -> SingleSearch {
        SingleSearch {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    #[inline(always)] // reduces constant overhead
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        let mut i = self.rare1i;
        while i < haystack.len() {
            // Skip ahead to the next occurrence of the rarest byte.
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(delta) => delta,
            };
            // Align a candidate window so that the rare byte hit lines up
            // with its offset in the pattern.
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            // Check the second rarest byte as a cheap guard before paying
            // for a full comparison.
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    #[inline(always)] // reduces constant overhead
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        &text[text.len() - self.len()..] == &*self.pat
    }

    pub fn len(&self) -> usize {
        self.pat.len()
    }

    pub fn char_len(&self) -> usize {
        self.char_len
    }

    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}

fn char_len_lossy(bytes: &[u8]) -> usize {
    String::from_utf8_lossy(bytes).chars().count()
}