| // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
| // file at the top-level directory of this distribution and at |
| // http://rust-lang.org/COPYRIGHT. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| use std::cmp; |
| use std::mem; |
| |
| use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; |
| use memchr::{memchr, memchr2, memchr3}; |
| use syntax::hir::literal::{Literal, Literals}; |
| |
| use freqs::BYTE_FREQUENCIES; |
| use self::teddy_avx2::{Teddy as TeddyAVX2}; |
| use self::teddy_ssse3::{Teddy as TeddySSSE3}; |
| |
| mod teddy_avx2; |
| mod teddy_ssse3; |
| |
| /// A prefix extracted from a compiled regular expression. |
| /// |
| /// A regex prefix is a set of literal strings that *must* be matched at the |
| /// beginning of a regex in order for the entire regex to match. Similarly |
| /// for a regex suffix. |
| #[derive(Clone, Debug)] |
| pub struct LiteralSearcher { |
| complete: bool, |
| lcp: FreqyPacked, |
| lcs: FreqyPacked, |
| matcher: Matcher, |
| } |
| |
| #[derive(Clone, Debug)] |
| enum Matcher { |
| /// No literals. (Never advances through the input.) |
| Empty, |
| /// A set of four or more single byte literals. |
| Bytes(SingleByteSet), |
| /// A single substring, find using memchr and frequency analysis. |
| FreqyPacked(FreqyPacked), |
| /// A single substring, find using Boyer-Moore. |
| BoyerMoore(BoyerMooreSearch), |
| /// An Aho-Corasick automaton. |
| AC(FullAcAutomaton<Literal>), |
| /// A simd accelerated multiple string matcher. Used only for a small |
| /// number of small literals. |
| TeddySSSE3(TeddySSSE3), |
| /// A simd accelerated multiple string matcher. Used only for a small |
| /// number of small literals. This uses 256-bit vectors. |
| TeddyAVX2(TeddyAVX2), |
| } |
| |
| impl LiteralSearcher { |
| /// Returns a matcher that never matches and never advances the input. |
| pub fn empty() -> Self { |
| Self::new(Literals::empty(), Matcher::Empty) |
| } |
| |
| /// Returns a matcher for literal prefixes from the given set. |
| pub fn prefixes(lits: Literals) -> Self { |
| let matcher = Matcher::prefixes(&lits); |
| Self::new(lits, matcher) |
| } |
| |
| /// Returns a matcher for literal suffixes from the given set. |
| pub fn suffixes(lits: Literals) -> Self { |
| let matcher = Matcher::suffixes(&lits); |
| Self::new(lits, matcher) |
| } |
| |
| fn new(lits: Literals, matcher: Matcher) -> Self { |
| let complete = lits.all_complete(); |
| LiteralSearcher { |
| complete: complete, |
| lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()), |
| lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()), |
| matcher: matcher, |
| } |
| } |
| |
| /// Returns true if all matches comprise the entire regular expression. |
| /// |
| /// This does not necessarily mean that a literal match implies a match |
| /// of the regular expression. For example, the regular expresison `^a` |
| /// is comprised of a single complete literal `a`, but the regular |
| /// expression demands that it only match at the beginning of a string. |
| pub fn complete(&self) -> bool { |
| self.complete && !self.is_empty() |
| } |
| |
| /// Find the position of a literal in `haystack` if it exists. |
| #[inline(always)] // reduces constant overhead |
| pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => Some((0, 0)), |
| Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), |
| FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())), |
| BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), |
| AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), |
| TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)), |
| TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)), |
| } |
| } |
| |
| /// Like find, except matches must start at index `0`. |
| pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| for lit in self.iter() { |
| if lit.len() > haystack.len() { |
| continue; |
| } |
| if lit == &haystack[0..lit.len()] { |
| return Some((0, lit.len())); |
| } |
| } |
| None |
| } |
| |
| /// Like find, except matches must end at index `haystack.len()`. |
| pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
| for lit in self.iter() { |
| if lit.len() > haystack.len() { |
| continue; |
| } |
| if lit == &haystack[haystack.len() - lit.len()..] { |
| return Some((haystack.len() - lit.len(), haystack.len())); |
| } |
| } |
| None |
| } |
| |
| /// Returns an iterator over all literals to be matched. |
| pub fn iter(&self) -> LiteralIter { |
| match self.matcher { |
| Matcher::Empty => LiteralIter::Empty, |
| Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), |
| Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), |
| Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), |
| Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), |
| Matcher::TeddySSSE3(ref ted) => { |
| LiteralIter::TeddySSSE3(ted.patterns()) |
| } |
| Matcher::TeddyAVX2(ref ted) => { |
| LiteralIter::TeddyAVX2(ted.patterns()) |
| } |
| } |
| } |
| |
| /// Returns a matcher for the longest common prefix of this matcher. |
| pub fn lcp(&self) -> &FreqyPacked { |
| &self.lcp |
| } |
| |
| /// Returns a matcher for the longest common suffix of this matcher. |
| pub fn lcs(&self) -> &FreqyPacked { |
| &self.lcs |
| } |
| |
| /// Returns true iff this prefix is empty. |
| pub fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| |
| /// Returns the number of prefixes in this machine. |
| pub fn len(&self) -> usize { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => 0, |
| Bytes(ref sset) => sset.dense.len(), |
| FreqyPacked(_) => 1, |
| BoyerMoore(_) => 1, |
| AC(ref aut) => aut.len(), |
| TeddySSSE3(ref ted) => ted.len(), |
| TeddyAVX2(ref ted) => ted.len(), |
| } |
| } |
| |
| /// Return the approximate heap usage of literals in bytes. |
| pub fn approximate_size(&self) -> usize { |
| use self::Matcher::*; |
| match self.matcher { |
| Empty => 0, |
| Bytes(ref sset) => sset.approximate_size(), |
| FreqyPacked(ref single) => single.approximate_size(), |
| BoyerMoore(ref single) => single.approximate_size(), |
| AC(ref aut) => aut.heap_bytes(), |
| TeddySSSE3(ref ted) => ted.approximate_size(), |
| TeddyAVX2(ref ted) => ted.approximate_size(), |
| } |
| } |
| } |
| |
| impl Matcher { |
| fn prefixes(lits: &Literals) -> Self { |
| let sset = SingleByteSet::prefixes(lits); |
| Matcher::new(lits, sset) |
| } |
| |
| fn suffixes(lits: &Literals) -> Self { |
| let sset = SingleByteSet::suffixes(lits); |
| Matcher::new(lits, sset) |
| } |
| |
| fn new(lits: &Literals, sset: SingleByteSet) -> Self { |
| if lits.literals().is_empty() { |
| return Matcher::Empty; |
| } |
| if sset.dense.len() >= 26 { |
| // Avoid trying to match a large number of single bytes. |
| // This is *very* sensitive to a frequency analysis comparison |
| // between the bytes in sset and the composition of the haystack. |
| // No matter the size of sset, if its members all are rare in the |
| // haystack, then it'd be worth using it. How to tune this... IDK. |
| // ---AG |
| return Matcher::Empty; |
| } |
| if sset.complete { |
| return Matcher::Bytes(sset); |
| } |
| if lits.literals().len() == 1 { |
| let lit = lits.literals()[0].to_vec(); |
| if BoyerMooreSearch::should_use(lit.as_slice()) { |
| return Matcher::BoyerMoore(BoyerMooreSearch::new(lit)); |
| } else { |
| return Matcher::FreqyPacked(FreqyPacked::new(lit)); |
| } |
| } |
| let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; |
| if TeddyAVX2::available() && !is_aho_corasick_fast { |
| const MAX_TEDDY_LITERALS: usize = 32; |
| if lits.literals().len() <= MAX_TEDDY_LITERALS { |
| if let Some(ted) = TeddyAVX2::new(lits) { |
| return Matcher::TeddyAVX2(ted); |
| } |
| } |
| } |
| if TeddySSSE3::available() && !is_aho_corasick_fast { |
| // Only try Teddy if Aho-Corasick can't use memchr on an ASCII |
| // byte. Also, in its current form, Teddy doesn't scale well to |
| // lots of literals. |
| // |
| // We impose the ASCII restriction since an alternation of |
| // non-ASCII string literals in the same language is likely to all |
| // start with the same byte. Even worse, the corpus being searched |
| // probably has a similar composition, which ends up completely |
| // negating the benefit of memchr. |
| const MAX_TEDDY_LITERALS: usize = 32; |
| if lits.literals().len() <= MAX_TEDDY_LITERALS { |
| if let Some(ted) = TeddySSSE3::new(lits) { |
| return Matcher::TeddySSSE3(ted); |
| } |
| } |
| // Fallthrough to ol' reliable Aho-Corasick... |
| } |
| let pats = lits.literals().to_owned(); |
| Matcher::AC(AcAutomaton::new(pats).into_full()) |
| } |
| } |
| |
| pub enum LiteralIter<'a> { |
| Empty, |
| Bytes(&'a [u8]), |
| Single(&'a [u8]), |
| AC(&'a [Literal]), |
| TeddySSSE3(&'a [Vec<u8>]), |
| TeddyAVX2(&'a [Vec<u8>]), |
| } |
| |
| impl<'a> Iterator for LiteralIter<'a> { |
| type Item = &'a [u8]; |
| |
| fn next(&mut self) -> Option<Self::Item> { |
| match *self { |
| LiteralIter::Empty => None, |
| LiteralIter::Bytes(ref mut many) => { |
| if many.is_empty() { |
| None |
| } else { |
| let next = &many[0..1]; |
| *many = &many[1..]; |
| Some(next) |
| } |
| } |
| LiteralIter::Single(ref mut one) => { |
| if one.is_empty() { |
| None |
| } else { |
| let next = &one[..]; |
| *one = &[]; |
| Some(next) |
| } |
| } |
| LiteralIter::AC(ref mut lits) => { |
| if lits.is_empty() { |
| None |
| } else { |
| let next = &lits[0]; |
| *lits = &lits[1..]; |
| Some(&**next) |
| } |
| } |
| LiteralIter::TeddySSSE3(ref mut lits) => { |
| if lits.is_empty() { |
| None |
| } else { |
| let next = &lits[0]; |
| *lits = &lits[1..]; |
| Some(&**next) |
| } |
| } |
| LiteralIter::TeddyAVX2(ref mut lits) => { |
| if lits.is_empty() { |
| None |
| } else { |
| let next = &lits[0]; |
| *lits = &lits[1..]; |
| Some(&**next) |
| } |
| } |
| } |
| } |
| } |
| |
| #[derive(Clone, Debug)] |
| struct SingleByteSet { |
| sparse: Vec<bool>, |
| dense: Vec<u8>, |
| complete: bool, |
| all_ascii: bool, |
| } |
| |
| impl SingleByteSet { |
| fn new() -> SingleByteSet { |
| SingleByteSet { |
| sparse: vec![false; 256], |
| dense: vec![], |
| complete: true, |
| all_ascii: true, |
| } |
| } |
| |
| fn prefixes(lits: &Literals) -> SingleByteSet { |
| let mut sset = SingleByteSet::new(); |
| for lit in lits.literals() { |
| sset.complete = sset.complete && lit.len() == 1; |
| if let Some(&b) = lit.get(0) { |
| if !sset.sparse[b as usize] { |
| if b > 0x7F { |
| sset.all_ascii = false; |
| } |
| sset.dense.push(b); |
| sset.sparse[b as usize] = true; |
| } |
| } |
| } |
| sset |
| } |
| |
| fn suffixes(lits: &Literals) -> SingleByteSet { |
| let mut sset = SingleByteSet::new(); |
| for lit in lits.literals() { |
| sset.complete = sset.complete && lit.len() == 1; |
| if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { |
| if !sset.sparse[b as usize] { |
| if b > 0x7F { |
| sset.all_ascii = false; |
| } |
| sset.dense.push(b); |
| sset.sparse[b as usize] = true; |
| } |
| } |
| } |
| sset |
| } |
| |
| /// Faster find that special cases certain sizes to use memchr. |
| #[inline(always)] // reduces constant overhead |
| fn find(&self, text: &[u8]) -> Option<usize> { |
| match self.dense.len() { |
| 0 => None, |
| 1 => memchr(self.dense[0], text), |
| 2 => memchr2(self.dense[0], self.dense[1], text), |
| 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), |
| _ => self._find(text), |
| } |
| } |
| |
| /// Generic find that works on any sized set. |
| fn _find(&self, haystack: &[u8]) -> Option<usize> { |
| for (i, &b) in haystack.iter().enumerate() { |
| if self.sparse[b as usize] { |
| return Some(i); |
| } |
| } |
| None |
| } |
| |
| fn approximate_size(&self) -> usize { |
| (self.dense.len() * mem::size_of::<u8>()) |
| + (self.sparse.len() * mem::size_of::<bool>()) |
| } |
| } |
| |
| /// Provides an implementation of fast subtring search using frequency |
| /// analysis. |
| /// |
| /// memchr is so fast that we do everything we can to keep the loop in memchr |
| /// for as long as possible. The easiest way to do this is to intelligently |
| /// pick the byte to send to memchr. The best byte is the byte that occurs |
| /// least frequently in the haystack. Since doing frequency analysis on the |
| /// haystack is far too expensive, we compute a set of fixed frequencies up |
| /// front and hard code them in src/freqs.rs. Frequency analysis is done via |
| /// scripts/frequencies.py. |
| #[derive(Clone, Debug)] |
| pub struct FreqyPacked { |
| /// The pattern. |
| pat: Vec<u8>, |
| /// The number of Unicode characters in the pattern. This is useful for |
| /// determining the effective length of a pattern when deciding which |
| /// optimizations to perform. A trailing incomplete UTF-8 sequence counts |
| /// as one character. |
| char_len: usize, |
| /// The rarest byte in the pattern, according to pre-computed frequency |
| /// analysis. |
| rare1: u8, |
| /// The offset of the rarest byte in `pat`. |
| rare1i: usize, |
| /// The second rarest byte in the pattern, according to pre-computed |
| /// frequency analysis. (This may be equivalent to the rarest byte.) |
| /// |
| /// The second rarest byte is used as a type of guard for quickly detecting |
| /// a mismatch after memchr locates an instance of the rarest byte. This |
| /// is a hedge against pathological cases where the pre-computed frequency |
| /// analysis may be off. (But of course, does not prevent *all* |
| /// pathological cases.) |
| rare2: u8, |
| /// The offset of the second rarest byte in `pat`. |
| rare2i: usize, |
| } |
| |
| impl FreqyPacked { |
| fn new(pat: Vec<u8>) -> FreqyPacked { |
| if pat.is_empty() { |
| return FreqyPacked::empty(); |
| } |
| |
| // Find the rarest two bytes. Try to make them distinct (but it's not |
| // required). |
| let mut rare1 = pat[0]; |
| let mut rare2 = pat[0]; |
| for b in pat[1..].iter().cloned() { |
| if freq_rank(b) < freq_rank(rare1) { |
| rare1 = b; |
| } |
| } |
| for &b in &pat { |
| if rare1 == rare2 { |
| rare2 = b |
| } else if b != rare1 && freq_rank(b) < freq_rank(rare2) { |
| rare2 = b; |
| } |
| } |
| |
| // And find the offsets of their last occurrences. |
| let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap(); |
| let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap(); |
| |
| let char_len = char_len_lossy(&pat); |
| FreqyPacked { |
| pat: pat, |
| char_len: char_len, |
| rare1: rare1, |
| rare1i: rare1i, |
| rare2: rare2, |
| rare2i: rare2i, |
| } |
| } |
| |
| fn empty() -> FreqyPacked { |
| FreqyPacked { |
| pat: vec![], |
| char_len: 0, |
| rare1: 0, |
| rare1i: 0, |
| rare2: 0, |
| rare2i: 0, |
| } |
| } |
| |
| #[inline(always)] // reduces constant overhead |
| pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
| let pat = &*self.pat; |
| if haystack.len() < pat.len() || pat.is_empty() { |
| return None; |
| } |
| let mut i = self.rare1i; |
| while i < haystack.len() { |
| i += match memchr(self.rare1, &haystack[i..]) { |
| None => return None, |
| Some(i) => i, |
| }; |
| let start = i - self.rare1i; |
| let end = start + pat.len(); |
| if end > haystack.len() { |
| return None; |
| } |
| let aligned = &haystack[start..end]; |
| if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat { |
| return Some(start); |
| } |
| i += 1; |
| } |
| None |
| } |
| |
| #[inline(always)] // reduces constant overhead |
| pub fn is_suffix(&self, text: &[u8]) -> bool { |
| if text.len() < self.len() { |
| return false; |
| } |
| text[text.len() - self.len()..] == *self.pat |
| } |
| |
| pub fn len(&self) -> usize { |
| self.pat.len() |
| } |
| |
| pub fn char_len(&self) -> usize { |
| self.char_len |
| } |
| |
| fn approximate_size(&self) -> usize { |
| self.pat.len() * mem::size_of::<u8>() |
| } |
| } |
| |
| fn char_len_lossy(bytes: &[u8]) -> usize { |
| String::from_utf8_lossy(bytes).chars().count() |
| } |
| |
| /// An implementation of Tuned Boyer-Moore as laid out by |
| /// Andrew Hume and Daniel Sunday in "Fast String Searching". |
| /// O(n) in the size of the input. |
| /// |
| /// Fast string searching algorithms come in many variations, |
| /// but they can generally be described in terms of three main |
| /// components. |
| /// |
| /// The skip loop is where the string searcher wants to spend |
| /// as much time as possible. Exactly which character in the |
| /// pattern the skip loop examines varies from algorithm to |
| /// algorithm, but in the simplest case this loop repeated |
| /// looks at the last character in the pattern and jumps |
| /// forward in the input if it is not in the pattern. |
| /// Robert Boyer and J Moore called this the "fast" loop in |
| /// their original paper. |
| /// |
| /// The match loop is responsible for actually examining the |
| /// whole potentially matching substring. In order to fail |
| /// faster, the match loop sometimes has a guard test attached. |
| /// The guard test uses frequency analysis of the different |
| /// characters in the pattern to choose the least frequency |
| /// occurring character and use it to find match failures |
| /// as quickly as possible. |
| /// |
| /// The shift rule governs how the algorithm will shuffle its |
| /// test window in the event of a failure during the match loop. |
| /// Certain shift rules allow the worst-case run time of the |
| /// algorithm to be shown to be O(n) in the size of the input |
| /// rather than O(nm) in the size of the input and the size |
| /// of the pattern (as naive Boyer-Moore is). |
| /// |
| /// "Fast String Searching", in addition to presenting a tuned |
| /// algorithm, provides a comprehensive taxonomy of the many |
| /// different flavors of string searchers. Under that taxonomy |
| /// TBM, the algorithm implemented here, uses an unrolled fast |
| /// skip loop with memchr fallback, a forward match loop with guard, |
| /// and the mini Sunday's delta shift rule. To unpack that you'll have to |
| /// read the paper. |
| #[derive(Clone, Debug)] |
| pub struct BoyerMooreSearch { |
| /// The pattern we are going to look for in the haystack. |
| pattern: Vec<u8>, |
| |
| /// The skip table for the skip loop. |
| /// |
| /// Maps the character at the end of the input |
| /// to a shift. |
| skip_table: Vec<usize>, |
| |
| /// The guard character (least frequently occurring char). |
| guard: u8, |
| /// The reverse-index of the guard character in the pattern. |
| guard_reverse_idx: usize, |
| |
| /// Daniel Sunday's mini generalized delta2 shift table. |
| /// |
| /// We use a skip loop, so we only have to provide a shift |
| /// for the skip char (last char). This is why it is a mini |
| /// shift rule. |
| md2_shift: usize, |
| } |
| |
| impl BoyerMooreSearch { |
| /// Create a new string searcher, performing whatever |
| /// compilation steps are required. |
| fn new(pattern: Vec<u8>) -> Self { |
| debug_assert!(pattern.len() > 0); |
| |
| let (g, gi) = Self::select_guard(pattern.as_slice()); |
| let skip_table = Self::compile_skip_table(pattern.as_slice()); |
| let md2_shift = Self::compile_md2_shift(pattern.as_slice()); |
| BoyerMooreSearch { |
| pattern: pattern, |
| skip_table: skip_table, |
| guard: g, |
| guard_reverse_idx: gi, |
| md2_shift: md2_shift, |
| } |
| } |
| |
| /// Find the pattern in `haystack`, returning the offset |
| /// of the start of the first occurrence of the pattern |
| /// in `haystack`. |
| #[inline] |
| fn find(&self, haystack: &[u8]) -> Option<usize> { |
| if haystack.len() < self.pattern.len() { |
| return None; |
| } |
| |
| let mut window_end = self.pattern.len() - 1; |
| |
| // Inspired by the grep source. It is a way |
| // to do correct loop unrolling without having to place |
| // a crashpad of terminating charicters at the end in |
| // the way described in the Fast String Searching paper. |
| const NUM_UNROLL: usize = 10; |
| // 1 for the initial position, and 1 for the md2 shift |
| let short_circut = (NUM_UNROLL + 2) * self.pattern.len(); |
| |
| if haystack.len() > short_circut { |
| // just 1 for the md2 shift |
| let backstop = haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len()); |
| loop { |
| window_end = match self.skip_loop(haystack, window_end, backstop) { |
| Some(i) => i, |
| None => return None, |
| }; |
| if window_end >= backstop { |
| break; |
| } |
| |
| if self.check_match(haystack, window_end) { |
| return Some(window_end - (self.pattern.len() - 1)); |
| } else { |
| let skip = self.skip_table[haystack[window_end] as usize]; |
| window_end += |
| if skip == 0 { self.md2_shift } else { skip }; |
| continue; |
| } |
| } |
| } |
| |
| // now process the input after the backstop |
| while window_end < haystack.len() { |
| let mut skip = self.skip_table[haystack[window_end] as usize]; |
| if skip == 0 { |
| if self.check_match(haystack, window_end) { |
| return Some(window_end - (self.pattern.len() - 1)); |
| } else { |
| skip = self.md2_shift; |
| } |
| } |
| window_end += skip; |
| } |
| |
| None |
| } |
| |
| fn len(&self) -> usize { |
| return self.pattern.len() |
| } |
| |
| /// The key heuristic behind which the BoyerMooreSearch lives. |
| /// |
| /// See `rust-lang/regex/issues/408`. |
| /// |
| /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled |
| /// platform-specific memchr routine with a bit of frequency |
| /// analysis sprinkled on top actually wins most of the time. |
| /// However, there are a few cases where Tuned Boyer-Moore still |
| /// wins. |
| /// |
| /// If the haystack is random, frequency analysis doesn't help us, |
| /// so Boyer-Moore will win for sufficiently large needles. |
| /// Unfortunately, there is no obvious way to determine this |
| /// ahead of time. |
| /// |
| /// If the pattern itself consists of very common characters, |
| /// frequency analysis won't get us anywhere. The most extreme |
| /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately, |
| /// this case is wholly determined by the pattern, so we can actually |
| /// implement the heuristic. |
| /// |
| /// A third case is if the pattern is sufficiently long. The idea |
| /// here is that once the pattern gets long enough the Tuned |
| /// Boyer-Moore skip loop will start making strides long enough |
| /// to beat the asm deep magic that is memchr. |
| fn should_use(pattern: &[u8]) -> bool { |
| // The minimum pattern length required to use TBM. |
| const MIN_LEN: usize = 9; |
| // The minimum frequency rank (lower is rarer) that every byte in the |
| // pattern must have in order to use TBM. That is, if the pattern |
| // contains _any_ byte with a lower rank, then TBM won't be used. |
| const MIN_CUTOFF: usize = 150; |
| // The maximum frequency rank for any byte. |
| const MAX_CUTOFF: usize = 255; |
| // The scaling factor used to determine the actual cutoff frequency |
| // to use (keeping in mind that the minimum frequency rank is bounded |
| // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more |
| // likely to be used as the pattern grows longer. That is, longer |
| // patterns permit somewhat less frequent bytes than shorter patterns, |
| // under the assumption that TBM gets better as the pattern gets |
| // longer. |
| const LEN_CUTOFF_PROPORTION: usize = 4; |
| |
| let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION); |
| let cutoff = cmp::max( |
| MIN_CUTOFF, |
| MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank), |
| ); |
| // The pattern must be long enough to be worthwhile. e.g., memchr will |
| // be faster on `e` because it is short even though e is quite common. |
| pattern.len() > MIN_LEN |
| // all the bytes must be more common than the cutoff. |
| && pattern.iter().all(|c| freq_rank(*c) >= cutoff) |
| } |
| |
| /// Check to see if there is a match at the given position |
| #[inline] |
| fn check_match(&self, haystack: &[u8], window_end: usize) -> bool { |
| // guard test |
| if haystack[window_end - self.guard_reverse_idx] != self.guard { |
| return false; |
| } |
| |
| // match loop |
| let window_start = window_end - (self.pattern.len() - 1); |
| for i in 0..self.pattern.len() { |
| if self.pattern[i] != haystack[window_start + i] { |
| return false; |
| } |
| } |
| |
| true |
| } |
| |
| /// Skip forward according to the shift table. |
| /// |
| /// Returns the offset of the next occurrence |
| /// of the last char in the pattern, or the none |
| /// if it never reappears. If `skip_loop` hits the backstop |
| /// it will leave early. |
| #[inline] |
| fn skip_loop(&self, |
| haystack: &[u8], |
| mut window_end: usize, |
| backstop: usize, |
| ) -> Option<usize> { |
| use std::mem; |
| |
| let window_end_snapshot = window_end; |
| let skip_of = |we: usize| -> usize { |
| // Unsafe might make this faster, but the benchmarks |
| // were hard to interpret. |
| self.skip_table[haystack[we] as usize] |
| }; |
| |
| loop { |
| let mut skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| if skip != 0 { |
| skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| if skip != 0 { |
| skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| if skip != 0 { |
| skip = skip_of(window_end); window_end += skip; |
| skip = skip_of(window_end); window_end += skip; |
| |
| // If ten iterations did not make at least 16 words |
| // worth of progress, we just fall back on memchr. |
| if window_end - window_end_snapshot > |
| 16 * mem::size_of::<usize>() { |
| |
| // Returning a window_end >= backstop will immediatly |
| // break us out of the inner loop in `find`. |
| if window_end >= backstop { |
| return Some(window_end); |
| } |
| |
| continue; // we made enough progress |
| } else { |
| // In case we are already there, and so that |
| // we will catch the guard char. |
| window_end = window_end |
| .checked_sub(1 + self.guard_reverse_idx) |
| .unwrap_or(0); |
| |
| match memchr(self.guard, &haystack[window_end..]) { |
| None => return None, |
| Some(g_idx) => { |
| return Some(window_end + g_idx + self.guard_reverse_idx); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| return Some(window_end); |
| } |
| } |
| |
| /// Compute the ufast skip table. |
| fn compile_skip_table(pattern: &[u8]) -> Vec<usize> { |
| let mut tab = vec![pattern.len(); 256]; |
| |
| // For every char in the pattern, we write a skip |
| // that will line us up with the rightmost occurrence. |
| // |
| // N.B. the sentinel (0) is written by the last |
| // loop iteration. |
| for (i, c) in pattern.iter().enumerate() { |
| tab[*c as usize] = (pattern.len() - 1) - i; |
| } |
| |
| tab |
| } |
| |
| /// Select the guard character based off of the precomputed |
| /// frequency table. |
| fn select_guard(pattern: &[u8]) -> (u8, usize) { |
| let mut rarest = pattern[0]; |
| let mut rarest_rev_idx = pattern.len() - 1; |
| for (i, c) in pattern.iter().enumerate() { |
| if freq_rank(*c) < freq_rank(rarest) { |
| rarest = *c; |
| rarest_rev_idx = (pattern.len() - 1) - i; |
| } |
| } |
| |
| (rarest, rarest_rev_idx) |
| } |
| |
| /// If there is another occurrence of the skip |
| /// char, shift to it, otherwise just shift to |
| /// the next window. |
| fn compile_md2_shift(pattern: &[u8]) -> usize { |
| let shiftc = *pattern.last().unwrap(); |
| |
| // For a pattern of length 1 we will never apply the |
| // shift rule, so we use a poison value on the principle |
| // that failing fast is a good thing. |
| if pattern.len() == 1 { |
| return 0xDEADBEAF; |
| } |
| |
| let mut i = pattern.len() - 2; |
| while i > 0 { |
| if pattern[i] == shiftc { |
| return (pattern.len() - 1) - i; |
| } |
| i -= 1; |
| } |
| |
| // The skip char never re-occurs in the pattern, so |
| // we can just shift the whole window length. |
| pattern.len() - 1 |
| } |
| |
| fn approximate_size(&self) -> usize { |
| (self.pattern.len() * mem::size_of::<u8>()) |
| + (256 * mem::size_of::<usize>()) // skip table |
| } |
| } |
| |
| fn freq_rank(b: u8) -> usize { |
| BYTE_FREQUENCIES[b as usize] as usize |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::{BoyerMooreSearch, FreqyPacked}; |
| |
| // |
| // Unit Tests |
| // |
| |
| // The "hello, world" of string searching |
| #[test] |
| fn bm_find_subs() { |
| let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); |
| let haystack = b"I keep seeing patterns in this text"; |
| assert_eq!(14, searcher.find(haystack).unwrap()); |
| } |
| |
| #[test] |
| fn bm_find_no_subs() { |
| let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..])); |
| let haystack = b"I keep seeing needles in this text"; |
| assert_eq!(None, searcher.find(haystack)); |
| } |
| |
| // |
| // Regression Tests |
| // |
| |
| #[test] |
| fn bm_skip_reset_bug() { |
| let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0]; |
| let needle = vec![0, 1, 1, 0]; |
| |
| let searcher = BoyerMooreSearch::new(needle); |
| let offset = searcher.find(haystack.as_slice()).unwrap(); |
| assert_eq!(4, offset); |
| } |
| |
| #[test] |
| fn bm_backstop_underflow_bug() { |
| let haystack = vec![0, 0]; |
| let needle = vec![0, 0]; |
| |
| let searcher = BoyerMooreSearch::new(needle); |
| let offset = searcher.find(haystack.as_slice()).unwrap(); |
| assert_eq!(0, offset); |
| } |
| |
| #[test] |
| fn bm_naive_off_by_one_bug() { |
| let haystack = vec![91]; |
| let needle = vec![91]; |
| |
| let naive_offset = naive_find(&needle, &haystack).unwrap(); |
| assert_eq!(0, naive_offset); |
| } |
| |
| #[test] |
| fn bm_memchr_fallback_indexing_bug() { |
| let mut haystack = vec![ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; |
| let needle = vec![1, 1, 1, 1, 32, 32, 87]; |
| let needle_start = haystack.len(); |
| haystack.extend(needle.clone()); |
| |
| let searcher = BoyerMooreSearch::new(needle); |
| assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap()); |
| } |
| |
| #[test] |
| fn bm_backstop_boundary() { |
| let haystack = b"\ |
| // aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa |
| e_data.clone_created(entity_id, entity_to_add.entity_id); |
| aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa |
| aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa |
| ".to_vec(); |
| let needle = b"clone_created".to_vec(); |
| |
| let searcher = BoyerMooreSearch::new(needle); |
| let result = searcher.find(&haystack); |
| assert_eq!(Some(43), result); |
| } |
| |
| #[test] |
| fn bm_win_gnu_indexing_bug() { |
| let haystack_raw = vec![ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; |
| let needle = vec![1, 1, 1, 1, 1, 1, 1]; |
| let haystack = haystack_raw.as_slice(); |
| |
| BoyerMooreSearch::new(needle.clone()).find(haystack); |
| } |
| |
| // |
| // QuickCheck Properties |
| // |
| |
| use quickcheck::TestResult; |
| |
| fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> { |
| assert!(needle.len() <= haystack.len()); |
| |
| for i in 0..(haystack.len() - (needle.len() - 1)) { |
| if haystack[i] == needle[0] |
| && &haystack[i..(i+needle.len())] == needle { |
| return Some(i) |
| } |
| } |
| |
| None |
| } |
| |
| quickcheck! { |
| fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { |
| if pile1.len() == 0 || pile2.len() == 0 { |
| return TestResult::discard(); |
| } |
| |
| let (needle, haystack) = if pile1.len() < pile2.len() { |
| (pile1, pile2.as_slice()) |
| } else { |
| (pile2, pile1.as_slice()) |
| }; |
| |
| let searcher = BoyerMooreSearch::new(needle.clone()); |
| TestResult::from_bool( |
| searcher.find(haystack) == naive_find(&needle, haystack)) |
| } |
| |
| fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult { |
| if pile1.len() == 0 || pile2.len() == 0 { |
| return TestResult::discard(); |
| } |
| |
| let (needle, haystack) = if pile1.len() < pile2.len() { |
| (pile1, pile2.as_slice()) |
| } else { |
| (pile2, pile1.as_slice()) |
| }; |
| |
| let bm_searcher = BoyerMooreSearch::new(needle.clone()); |
| let freqy_memchr = FreqyPacked::new(needle); |
| TestResult::from_bool( |
| bm_searcher.find(haystack) == freqy_memchr.find(haystack)) |
| } |
| |
| fn qc_bm_finds_trailing_needle( |
| haystack_pre: Vec<u8>, |
| needle: Vec<u8> |
| ) -> TestResult { |
| if needle.len() == 0 { |
| return TestResult::discard(); |
| } |
| |
| let mut haystack = haystack_pre.clone(); |
| let searcher = BoyerMooreSearch::new(needle.clone()); |
| |
| if haystack.len() >= needle.len() && |
| searcher.find(haystack.as_slice()).is_some() { |
| return TestResult::discard(); |
| } |
| |
| haystack.extend(needle.clone()); |
| |
| // What if the the tail of the haystack can start the |
| // needle? |
| let start = haystack_pre.len() |
| .checked_sub(needle.len()) |
| .unwrap_or(0); |
| for i in 0..(needle.len() - 1) { |
| if searcher.find(&haystack[(i + start)..]).is_some() { |
| return TestResult::discard(); |
| } |
| } |
| |
| TestResult::from_bool( |
| searcher.find(haystack.as_slice()) |
| .map(|x| x == haystack_pre.len()) |
| .unwrap_or(false)) |
| } |
| |
| // qc_equals_* is only testing the negative case as @burntsushi |
| // pointed out in https://github.com/rust-lang/regex/issues/446. |
| // This quickcheck prop represents an effort to force testing of |
| // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle |
| // already check some of the positive cases, but they don't cover |
| // cases where the needle is in the middle of haystack. This prop |
| // fills that hole. |
| fn qc_bm_finds_subslice( |
| haystack: Vec<u8>, |
| needle_start: usize, |
| needle_length: usize |
| ) -> TestResult { |
| if haystack.len() == 0 { |
| return TestResult::discard(); |
| } |
| |
| let needle_start = needle_start % haystack.len(); |
| let needle_length = needle_length % (haystack.len() - needle_start); |
| |
| if needle_length == 0 { |
| return TestResult::discard(); |
| } |
| |
| let needle = &haystack[needle_start..(needle_start + needle_length)]; |
| |
| let bm_searcher = BoyerMooreSearch::new(needle.to_vec()); |
| |
| let start = naive_find(&needle, &haystack); |
| match start { |
| None => TestResult::from_bool(false), |
| Some(nf_start) => |
| TestResult::from_bool( |
| nf_start <= needle_start |
| && bm_searcher.find(&haystack) == start |
| ) |
| } |
| } |
| |
| fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult { |
| if needle.len() == 0 { |
| return TestResult::discard(); |
| } |
| |
| let mut haystack = needle.clone(); |
| let searcher = BoyerMooreSearch::new(needle.clone()); |
| haystack.extend(needle); |
| |
| TestResult::from_bool( |
| searcher.find(haystack.as_slice()) |
| .map(|x| x == 0) |
| .unwrap_or(false)) |
| } |
| } |
| } |