| // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
| // file at the top-level directory of this distribution and at |
| // http://rust-lang.org/COPYRIGHT. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| use std::cmp; |
| use std::fmt; |
| use std::iter; |
| use std::mem; |
| use std::ops; |
| |
| use {Expr, CharClass, ClassRange, ByteClass, ByteRange, Repeater}; |
| |
| /// A set of literal byte strings extracted from a regular expression. |
| /// |
| /// Every member of the set is a `Lit`, which is represented by a `Vec<u8>`. |
| /// (Notably, it may contain invalid UTF-8.) Every member is said to be either |
| /// *complete* or *cut*. A complete literal means that it extends until the |
| /// beginning (or end) of the regular expression. In some circumstances, this |
| /// can be used to indicate a match in the regular expression. |
| /// |
| /// Note that a key aspect of literal extraction is knowing when to stop. It is |
| /// not feasible to blindly extract all literals from a regular expression, |
| /// even if there are finitely many. For example, the regular expression |
| /// `[0-9]{10}` has `10^10` distinct literals. For this reason, literal |
| /// extraction is bounded to some low number by default using heuristics, but |
| /// the limits can be tweaked. |
| #[derive(Clone, Eq, PartialEq)] |
| pub struct Literals { |
| lits: Vec<Lit>, |
| limit_size: usize, |
| limit_class: usize, |
| } |
| |
| /// A single member of a set of literals extracted from a regular expression. |
| /// |
| /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice |
| /// and `Vec` operations are available. |
| #[derive(Clone, Eq, Ord)] |
| pub struct Lit { |
| v: Vec<u8>, |
| cut: bool, |
| } |
| |
| impl Literals { |
| /// Returns a new empty set of literals using default limits. |
| pub fn empty() -> Literals { |
| Literals { |
| lits: vec![], |
| limit_size: 250, |
| limit_class: 10, |
| } |
| } |
| |
| /// Get the approximate size limit (in bytes) of this set. |
| pub fn limit_size(&self) -> usize { |
| self.limit_size |
| } |
| |
| /// Set the approximate size limit (in bytes) of this set. |
| /// |
| /// If extracting a literal would put the set over this limit, then |
| /// extraction stops. |
| /// |
| /// The new limits will only apply to additions to this set. Existing |
| /// members remain unchanged, even if the set exceeds the new limit. |
| pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { |
| self.limit_size = size; |
| self |
| } |
| |
| /// Get the character class size limit for this set. |
| pub fn limit_class(&self) -> usize { |
| self.limit_class |
| } |
| |
| /// Limits the size of character(or byte) classes considered. |
| /// |
| /// A value of `0` prevents all character classes from being considered. |
| /// |
| /// This limit also applies to case insensitive literals, since each |
| /// character in the case insensitive literal is converted to a class, and |
| /// then case folded. |
| /// |
| /// The new limits will only apply to additions to this set. Existing |
| /// members remain unchanged, even if the set exceeds the new limit. |
| pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { |
| self.limit_class = size; |
| self |
| } |
| |
| /// Returns the set of literals as a slice. Its order is unspecified. |
| pub fn literals(&self) -> &[Lit] { |
| &self.lits |
| } |
| |
| /// Returns the length of the smallest literal. |
| /// |
| /// Returns None is there are no literals in the set. |
| pub fn min_len(&self) -> Option<usize> { |
| let mut min = None; |
| for lit in &self.lits { |
| match min { |
| None => min = Some(lit.len()), |
| Some(m) if lit.len() < m => min = Some(lit.len()), |
| _ => {} |
| } |
| } |
| min |
| } |
| |
| /// Returns true if all members in this set are complete. |
| pub fn all_complete(&self) -> bool { |
| !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) |
| } |
| |
| /// Returns true if any member in this set is complete. |
| pub fn any_complete(&self) -> bool { |
| self.lits.iter().any(|lit| !lit.is_cut()) |
| } |
| |
| /// Returns true if this set contains an empty literal. |
| pub fn contains_empty(&self) -> bool { |
| self.lits.iter().any(|lit| lit.is_empty()) |
| } |
| |
| /// Returns true if this set is empty or if all of its members is empty. |
| pub fn is_empty(&self) -> bool { |
| self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) |
| } |
| |
| /// Returns a new empty set of literals using this set's limits. |
| pub fn to_empty(&self) -> Literals { |
| let mut lits = Literals::empty(); |
| lits.set_limit_size(self.limit_size) |
| .set_limit_class(self.limit_class); |
| lits |
| } |
| |
| /// Returns the longest common prefix of all members in this set. |
| pub fn longest_common_prefix(&self) -> &[u8] { |
| if self.is_empty() { |
| return &[]; |
| } |
| let lit0 = &*self.lits[0]; |
| let mut len = lit0.len(); |
| for lit in &self.lits[1..] { |
| len = cmp::min( |
| len, |
| lit.iter() |
| .zip(lit0) |
| .take_while(|&(a, b)| a == b) |
| .count()); |
| } |
| &self.lits[0][..len] |
| } |
| |
| /// Returns the longest common suffix of all members in this set. |
| pub fn longest_common_suffix(&self) -> &[u8] { |
| if self.is_empty() { |
| return &[]; |
| } |
| let lit0 = &*self.lits[0]; |
| let mut len = lit0.len(); |
| for lit in &self.lits[1..] { |
| len = cmp::min( |
| len, |
| lit.iter() |
| .rev() |
| .zip(lit0.iter().rev()) |
| .take_while(|&(a, b)| a == b) |
| .count()); |
| } |
| &self.lits[0][self.lits[0].len() - len..] |
| } |
| |
| /// Returns a new set of literals with the given number of bytes trimmed |
| /// from the suffix of each literal. |
| /// |
| /// If any literal would be cut out completely by trimming, then None is |
| /// returned. |
| /// |
| /// Any duplicates that are created as a result of this transformation are |
| /// removed. |
| pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { |
| if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { |
| return None; |
| } |
| let mut new = self.to_empty(); |
| for mut lit in self.lits.iter().cloned() { |
| let new_len = lit.len() - num_bytes; |
| lit.truncate(new_len); |
| lit.cut(); |
| new.lits.push(lit); |
| } |
| new.lits.sort(); |
| new.lits.dedup(); |
| Some(new) |
| } |
| |
| /// Returns a new set of prefixes of this set of literals that are |
| /// guaranteed to be unambiguous. |
| /// |
| /// Any substring match with a member of the set is returned is guaranteed |
| /// to never overlap with a substring match of another member of the set |
| /// at the same starting position. |
| /// |
| /// Given any two members of the returned set, neither is a substring of |
| /// the other. |
| pub fn unambiguous_prefixes(&self) -> Literals { |
| if self.lits.is_empty() { |
| return self.to_empty(); |
| } |
| let mut old: Vec<Lit> = self.lits.iter().cloned().collect(); |
| let mut new = self.to_empty(); |
| 'OUTER: |
| while let Some(mut candidate) = old.pop() { |
| if candidate.is_empty() { |
| continue; |
| } |
| if new.lits.is_empty() { |
| new.lits.push(candidate); |
| continue; |
| } |
| for lit2 in &mut new.lits { |
| if lit2.is_empty() { |
| continue; |
| } |
| if &candidate == lit2 { |
| // If the literal is already in the set, then we can |
| // just drop it. But make sure that cut literals are |
| // infectious! |
| candidate.cut = candidate.cut || lit2.cut; |
| lit2.cut = candidate.cut; |
| continue 'OUTER; |
| } |
| if candidate.len() < lit2.len() { |
| if let Some(i) = position(&candidate, &lit2) { |
| candidate.cut(); |
| let mut lit3 = lit2.clone(); |
| lit3.truncate(i); |
| lit3.cut(); |
| old.push(lit3); |
| lit2.clear(); |
| } |
| } else { |
| if let Some(i) = position(&lit2, &candidate) { |
| lit2.cut(); |
| let mut new_candidate = candidate.clone(); |
| new_candidate.truncate(i); |
| new_candidate.cut(); |
| old.push(new_candidate); |
| candidate.clear(); |
| } |
| } |
| // Oops, the candidate is already represented in the set. |
| if candidate.is_empty() { |
| continue 'OUTER; |
| } |
| } |
| new.lits.push(candidate); |
| } |
| new.lits.retain(|lit| !lit.is_empty()); |
| new.lits.sort(); |
| new.lits.dedup(); |
| new |
| } |
| |
| /// Returns a new set of suffixes of this set of literals that are |
| /// guaranteed to be unambiguous. |
| /// |
| /// Any substring match with a member of the set is returned is guaranteed |
| /// to never overlap with a substring match of another member of the set |
| /// at the same ending position. |
| /// |
| /// Given any two members of the returned set, neither is a substring of |
| /// the other. |
| pub fn unambiguous_suffixes(&self) -> Literals { |
| // This is a touch wasteful... |
| let mut lits = self.clone(); |
| lits.reverse(); |
| let mut unamb = lits.unambiguous_prefixes(); |
| unamb.reverse(); |
| unamb |
| } |
| |
| /// Unions the prefixes from the given expression to this set. |
| /// |
| /// If prefixes could not be added (for example, this set would exceed its |
| /// size limits or the set of prefixes from `expr` includes the empty |
| /// string), then false is returned. |
| /// |
| /// Note that prefix literals extracted from `expr` are said to be complete |
| /// if and only if the literal extends from the beginning of `expr` to the |
| /// end of `expr`. |
| pub fn union_prefixes(&mut self, expr: &Expr) -> bool { |
| let mut lits = self.to_empty(); |
| prefixes(expr, &mut lits); |
| !lits.is_empty() && !lits.contains_empty() && self.union(lits) |
| } |
| |
| /// Unions the suffixes from the given expression to this set. |
| /// |
| /// If suffixes could not be added (for example, this set would exceed its |
| /// size limits or the set of suffixes from `expr` includes the empty |
| /// string), then false is returned. |
| /// |
| /// Note that prefix literals extracted from `expr` are said to be complete |
| /// if and only if the literal extends from the end of `expr` to the |
| /// beginning of `expr`. |
| pub fn union_suffixes(&mut self, expr: &Expr) -> bool { |
| let mut lits = self.to_empty(); |
| suffixes(expr, &mut lits); |
| lits.reverse(); |
| !lits.is_empty() && !lits.contains_empty() && self.union(lits) |
| } |
| |
| /// Unions this set with another set. |
| /// |
| /// If the union would cause the set to exceed its limits, then the union |
| /// is skipped and it returns false. Otherwise, if the union succeeds, it |
| /// returns true. |
| pub fn union(&mut self, lits: Literals) -> bool { |
| if self.num_bytes() + lits.num_bytes() > self.limit_size { |
| return false; |
| } |
| if lits.is_empty() { |
| self.lits.push(Lit::empty()); |
| } else { |
| self.lits.extend(lits.lits); |
| } |
| true |
| } |
| |
| /// Extends this set with another set. |
| /// |
| /// The set of literals is extended via a cross product. |
| /// |
| /// If a cross product would cause this set to exceed its limits, then the |
| /// cross product is skipped and it returns false. Otherwise, if the cross |
| /// product succeeds, it returns true. |
| pub fn cross_product(&mut self, lits: &Literals) -> bool { |
| if lits.is_empty() { |
| return true; |
| } |
| // Check that we make sure we stay in our limits. |
| let mut size_after; |
| if self.is_empty() || !self.any_complete() { |
| size_after = self.num_bytes(); |
| for lits_lit in lits.literals() { |
| size_after += lits_lit.len(); |
| } |
| } else { |
| size_after = self.lits.iter().fold(0, |accum, lit| { |
| accum + if lit.is_cut() { lit.len() } else { 0 } |
| }); |
| for lits_lit in lits.literals() { |
| for self_lit in self.literals() { |
| if !self_lit.is_cut() { |
| size_after += self_lit.len() + lits_lit.len(); |
| } |
| } |
| } |
| } |
| if size_after > self.limit_size { |
| return false; |
| } |
| |
| let mut base = self.remove_complete(); |
| if base.is_empty() { |
| base = vec![Lit::empty()]; |
| } |
| for lits_lit in lits.literals() { |
| for mut self_lit in base.clone() { |
| self_lit.extend(&**lits_lit); |
| self_lit.cut = lits_lit.cut; |
| self.lits.push(self_lit); |
| } |
| } |
| true |
| } |
| |
| /// Extends each literal in this set with the bytes given. |
| /// |
| /// If the set is empty, then the given literal is added to the set. |
| /// |
| /// If adding any number of bytes to all members of this set causes a limit |
| /// to be exceeded, then no bytes are added and false is returned. If a |
| /// prefix of `bytes` can be fit into this set, then it is used and all |
| /// resulting literals are cut. |
| pub fn cross_add(&mut self, bytes: &[u8]) -> bool { |
| // N.B. This could be implemented by simply calling cross_product with |
| // a literal set containing just `bytes`, but we can be smarter about |
| // taking shorter prefixes of `bytes` if they'll fit. |
| if bytes.is_empty() { |
| return true; |
| } |
| if self.lits.is_empty() { |
| let i = cmp::min(self.limit_size, bytes.len()); |
| self.lits.push(Lit::new(bytes[..i].to_owned())); |
| self.lits[0].cut = i < bytes.len(); |
| return !self.lits[0].is_cut(); |
| } |
| let size = self.num_bytes(); |
| if size + self.lits.len() >= self.limit_size { |
| return false; |
| } |
| let mut i = 1; |
| while size + (i * self.lits.len()) <= self.limit_size |
| && i < bytes.len() { |
| i += 1; |
| } |
| for lit in &mut self.lits { |
| if !lit.is_cut() { |
| lit.extend(&bytes[..i]); |
| if i < bytes.len() { |
| lit.cut(); |
| } |
| } |
| } |
| true |
| } |
| |
| /// Adds the given literal to this set. |
| /// |
| /// Returns false if adding this literal would cause the class to be too |
| /// big. |
| pub fn add(&mut self, lit: Lit) -> bool { |
| if self.num_bytes() + lit.len() > self.limit_size { |
| return false; |
| } |
| self.lits.push(lit); |
| true |
| } |
| |
| /// Extends each literal in this set with the character class given. |
| /// |
| /// Returns false if the character class was too big to add. |
| pub fn add_char_class(&mut self, cls: &CharClass) -> bool { |
| self._add_char_class(cls, false) |
| } |
| |
| /// Extends each literal in this set with the character class given, |
| /// writing the bytes of each character in reverse. |
| /// |
| /// Returns false if the character class was too big to add. |
| fn add_char_class_reverse(&mut self, cls: &CharClass) -> bool { |
| self._add_char_class(cls, true) |
| } |
| |
| fn _add_char_class(&mut self, cls: &CharClass, reverse: bool) -> bool { |
| use std::char; |
| |
| if self.class_exceeds_limits(cls.num_chars()) { |
| return false; |
| } |
| let mut base = self.remove_complete(); |
| if base.is_empty() { |
| base = vec![Lit::empty()]; |
| } |
| for r in cls { |
| let (s, e) = (r.start as u32, r.end as u32 + 1); |
| for c in (s..e).filter_map(char::from_u32) { |
| for mut lit in base.clone() { |
| let mut bytes = c.to_string().into_bytes(); |
| if reverse { |
| bytes.reverse(); |
| } |
| lit.extend(&bytes); |
| self.lits.push(lit); |
| } |
| } |
| } |
| true |
| } |
| |
| /// Extends each literal in this set with the byte class given. |
| /// |
| /// Returns false if the byte class was too big to add. |
| pub fn add_byte_class(&mut self, cls: &ByteClass) -> bool { |
| if self.class_exceeds_limits(cls.num_bytes()) { |
| return false; |
| } |
| let mut base = self.remove_complete(); |
| if base.is_empty() { |
| base = vec![Lit::empty()]; |
| } |
| for r in cls { |
| let (s, e) = (r.start as u32, r.end as u32 + 1); |
| for b in (s..e).map(|b| b as u8) { |
| for mut lit in base.clone() { |
| lit.push(b); |
| self.lits.push(lit); |
| } |
| } |
| } |
| true |
| } |
| |
| /// Cuts every member of this set. When a member is cut, it can never |
| /// be extended. |
| pub fn cut(&mut self) { |
| for lit in &mut self.lits { |
| lit.cut(); |
| } |
| } |
| |
| /// Reverses all members in place. |
| pub fn reverse(&mut self) { |
| for lit in &mut self.lits { |
| lit.reverse(); |
| } |
| } |
| |
| /// Clears this set of all members. |
| pub fn clear(&mut self) { |
| self.lits.clear(); |
| } |
| |
| /// Pops all complete literals out of this set. |
| fn remove_complete(&mut self) -> Vec<Lit> { |
| let mut base = vec![]; |
| for lit in mem::replace(&mut self.lits, vec![]) { |
| if lit.is_cut() { |
| self.lits.push(lit); |
| } else { |
| base.push(lit); |
| } |
| } |
| base |
| } |
| |
| /// Returns the total number of bytes in this set. |
| fn num_bytes(&self) -> usize { |
| self.lits.iter().fold(0, |accum, lit| accum + lit.len()) |
| } |
| |
| /// Returns true if a character class with the given size would cause this |
| /// set to exceed its limits. |
| /// |
| /// The size given should correspond to the number of items in the class. |
| fn class_exceeds_limits(&self, size: usize) -> bool { |
| if size > self.limit_class { |
| return true; |
| } |
| // This is an approximation since codepoints in a char class can encode |
| // to 1-4 bytes. |
| let new_byte_count = |
| if self.lits.is_empty() { |
| size |
| } else { |
| self.lits |
| .iter() |
| .fold(0, |accum, lit| { |
| accum + if lit.is_cut() { |
| // If the literal is cut, then we'll never add |
| // anything to it, so don't count it. |
| 0 |
| } else { |
| (lit.len() + 1) * size |
| } |
| }) |
| }; |
| new_byte_count > self.limit_size |
| } |
| } |
| |
| fn prefixes(expr: &Expr, lits: &mut Literals) { |
| use Expr::*; |
| match *expr { |
| Literal { ref chars, casei: false } => { |
| let s: String = chars.iter().cloned().collect(); |
| lits.cross_add(s.as_bytes()); |
| } |
| Literal { ref chars, casei: true } => { |
| for &c in chars { |
| let cls = CharClass::new(vec![ |
| ClassRange { start: c, end: c }, |
| ]).case_fold(); |
| if !lits.add_char_class(&cls) { |
| lits.cut(); |
| return; |
| } |
| } |
| } |
| LiteralBytes { ref bytes, casei: false } => { |
| lits.cross_add(bytes); |
| } |
| LiteralBytes { ref bytes, casei: true } => { |
| for &b in bytes { |
| let cls = ByteClass::new(vec![ |
| ByteRange { start: b, end: b }, |
| ]).case_fold(); |
| if !lits.add_byte_class(&cls) { |
| lits.cut(); |
| return; |
| } |
| } |
| } |
| Class(ref cls) => { |
| if !lits.add_char_class(cls) { |
| lits.cut(); |
| } |
| } |
| ClassBytes(ref cls) => { |
| if !lits.add_byte_class(cls) { |
| lits.cut(); |
| } |
| } |
| Group { ref e, .. } => { |
| prefixes(&**e, lits); |
| } |
| Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { |
| repeat_zero_or_one_literals(&**e, lits, prefixes); |
| } |
| Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { |
| repeat_zero_or_more_literals(&**e, lits, prefixes); |
| } |
| Repeat { ref e, r: Repeater::OneOrMore, .. } => { |
| repeat_one_or_more_literals(&**e, lits, prefixes); |
| } |
| Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { |
| repeat_range_literals(&**e, min, max, greedy, lits, prefixes); |
| } |
| Concat(ref es) if es.is_empty() => {} |
| Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), |
| Concat(ref es) => { |
| for e in es { |
| if let StartText = *e { |
| if !lits.is_empty() { |
| lits.cut(); |
| break; |
| } |
| lits.add(Lit::empty()); |
| continue; |
| } |
| let mut lits2 = lits.to_empty(); |
| prefixes(e, &mut lits2); |
| if !lits.cross_product(&lits2) || !lits2.any_complete() { |
| // If this expression couldn't yield any literal that |
| // could be extended, then we need to quit. Since we're |
| // short-circuiting, we also need to freeze every member. |
| lits.cut(); |
| break; |
| } |
| } |
| } |
| Alternate(ref es) => { |
| alternate_literals(es, lits, prefixes); |
| } |
| _ => lits.cut(), |
| } |
| } |
| |
| fn suffixes(expr: &Expr, lits: &mut Literals) { |
| use Expr::*; |
| match *expr { |
| Literal { ref chars, casei: false } => { |
| let s: String = chars.iter().cloned().collect(); |
| let mut bytes = s.into_bytes(); |
| bytes.reverse(); |
| lits.cross_add(&bytes); |
| } |
| Literal { ref chars, casei: true } => { |
| for &c in chars.iter().rev() { |
| let cls = CharClass::new(vec![ |
| ClassRange { start: c, end: c }, |
| ]).case_fold(); |
| if !lits.add_char_class_reverse(&cls) { |
| lits.cut(); |
| return; |
| } |
| } |
| } |
| LiteralBytes { ref bytes, casei: false } => { |
| let b: Vec<u8> = bytes.iter().rev().cloned().collect(); |
| lits.cross_add(&b); |
| } |
| LiteralBytes { ref bytes, casei: true } => { |
| for &b in bytes.iter().rev() { |
| let cls = ByteClass::new(vec![ |
| ByteRange { start: b, end: b }, |
| ]).case_fold(); |
| if !lits.add_byte_class(&cls) { |
| lits.cut(); |
| return; |
| } |
| } |
| } |
| Class(ref cls) => { |
| if !lits.add_char_class_reverse(cls) { |
| lits.cut(); |
| } |
| } |
| ClassBytes(ref cls) => { |
| if !lits.add_byte_class(cls) { |
| lits.cut(); |
| } |
| } |
| Group { ref e, .. } => { |
| suffixes(&**e, lits); |
| } |
| Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { |
| repeat_zero_or_one_literals(&**e, lits, suffixes); |
| } |
| Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { |
| repeat_zero_or_more_literals(&**e, lits, suffixes); |
| } |
| Repeat { ref e, r: Repeater::OneOrMore, .. } => { |
| repeat_one_or_more_literals(&**e, lits, suffixes); |
| } |
| Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { |
| repeat_range_literals(&**e, min, max, greedy, lits, suffixes); |
| } |
| Concat(ref es) if es.is_empty() => {} |
| Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), |
| Concat(ref es) => { |
| for e in es.iter().rev() { |
| if let EndText = *e { |
| if !lits.is_empty() { |
| lits.cut(); |
| break; |
| } |
| lits.add(Lit::empty()); |
| continue; |
| } |
| let mut lits2 = lits.to_empty(); |
| suffixes(e, &mut lits2); |
| if !lits.cross_product(&lits2) || !lits2.any_complete() { |
| // If this expression couldn't yield any literal that |
| // could be extended, then we need to quit. Since we're |
| // short-circuiting, we also need to freeze every member. |
| lits.cut(); |
| break; |
| } |
| } |
| } |
| Alternate(ref es) => { |
| alternate_literals(es, lits, suffixes); |
| } |
| _ => lits.cut(), |
| } |
| } |
| |
| fn repeat_zero_or_one_literals<F: FnMut(&Expr, &mut Literals)>( |
| e: &Expr, |
| lits: &mut Literals, |
| mut f: F, |
| ) { |
| let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); |
| lits3.set_limit_size(lits.limit_size() / 2); |
| f(e, &mut lits3); |
| |
| if lits3.is_empty() || !lits2.cross_product(&lits3) { |
| lits.cut(); |
| return; |
| } |
| lits2.add(Lit::empty()); |
| if !lits.union(lits2) { |
| lits.cut(); |
| } |
| } |
| |
| fn repeat_zero_or_more_literals<F: FnMut(&Expr, &mut Literals)>( |
| e: &Expr, |
| lits: &mut Literals, |
| mut f: F, |
| ) { |
| let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); |
| lits3.set_limit_size(lits.limit_size() / 2); |
| f(e, &mut lits3); |
| |
| if lits3.is_empty() || !lits2.cross_product(&lits3) { |
| lits.cut(); |
| return; |
| } |
| lits2.cut(); |
| lits2.add(Lit::empty()); |
| if !lits.union(lits2) { |
| lits.cut(); |
| } |
| } |
| |
| fn repeat_one_or_more_literals<F: FnMut(&Expr, &mut Literals)>( |
| e: &Expr, |
| lits: &mut Literals, |
| mut f: F, |
| ) { |
| f(e, lits); |
| lits.cut(); |
| } |
| |
| fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>( |
| e: &Expr, |
| min: u32, |
| max: Option<u32>, |
| greedy: bool, |
| lits: &mut Literals, |
| mut f: F, |
| ) { |
| use Expr::*; |
| |
| if min == 0 { |
| // This is a bit conservative. If `max` is set, then we could |
| // treat this as a finite set of alternations. For now, we |
| // just treat it as `e*`. |
| f(&Repeat { |
| e: Box::new(e.clone()), |
| r: Repeater::ZeroOrMore, |
| greedy: greedy, |
| }, lits); |
| } else { |
| if min > 0 { |
| let n = cmp::min(lits.limit_size, min as usize); |
| let es = iter::repeat(e.clone()).take(n).collect(); |
| f(&Concat(es), lits); |
| if n < min as usize { |
| lits.cut(); |
| } |
| } |
| if max.map_or(true, |max| min < max) { |
| lits.cut(); |
| } |
| } |
| } |
| |
| fn alternate_literals<F: FnMut(&Expr, &mut Literals)>( |
| es: &[Expr], |
| lits: &mut Literals, |
| mut f: F, |
| ) { |
| let mut lits2 = lits.to_empty(); |
| for e in es { |
| let mut lits3 = lits.to_empty(); |
| lits3.set_limit_size(lits.limit_size() / 5); |
| f(e, &mut lits3); |
| if lits3.is_empty() || !lits2.union(lits3) { |
| // If we couldn't find suffixes for *any* of the |
| // alternates, then the entire alternation has to be thrown |
| // away and any existing members must be frozen. Similarly, |
| // if the union couldn't complete, stop and freeze. |
| lits.cut(); |
| return; |
| } |
| } |
| if !lits.cross_product(&lits2) { |
| lits.cut(); |
| } |
| } |
| |
| impl fmt::Debug for Literals { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| f.debug_struct("Literals") |
| .field("lits", &self.lits) |
| .field("limit_size", &self.limit_size) |
| .field("limit_class", &self.limit_class) |
| .finish() |
| } |
| } |
| |
| impl Lit { |
| /// Returns a new complete literal with the bytes given. |
| pub fn new(bytes: Vec<u8>) -> Lit { |
| Lit { v: bytes, cut: false } |
| } |
| |
| /// Returns a new complete empty literal. |
| pub fn empty() -> Lit { |
| Lit { v: vec![], cut: false } |
| } |
| |
| /// Returns true if this literal was "cut." |
| pub fn is_cut(&self) -> bool { |
| self.cut |
| } |
| |
| /// Cuts this literal. |
| pub fn cut(&mut self) { |
| self.cut = true; |
| } |
| } |
| |
| impl PartialEq for Lit { |
| fn eq(&self, other: &Lit) -> bool { |
| self.v == other.v |
| } |
| } |
| |
| impl PartialOrd for Lit { |
| fn partial_cmp(&self, other: &Lit) -> Option<cmp::Ordering> { |
| self.v.partial_cmp(&other.v) |
| } |
| } |
| |
| impl fmt::Debug for Lit { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if self.is_cut() { |
| write!(f, "Cut({})", escape_unicode(&self.v)) |
| } else { |
| write!(f, "Complete({})", escape_unicode(&self.v)) |
| } |
| } |
| } |
| |
| impl AsRef<[u8]> for Lit { |
| fn as_ref(&self) -> &[u8] { &self.v } |
| } |
| |
| impl ops::Deref for Lit { |
| type Target = Vec<u8>; |
| fn deref(&self) -> &Vec<u8> { &self.v } |
| } |
| |
| impl ops::DerefMut for Lit { |
| fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v } |
| } |
| |
| fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { |
| let mut i = 0; |
| while haystack.len() >= needle.len() { |
| if needle == &haystack[..needle.len()] { |
| return Some(i); |
| } |
| i += 1; |
| haystack = &haystack[1..]; |
| } |
| None |
| } |
| |
| fn escape_unicode(bytes: &[u8]) -> String { |
| let show = match ::std::str::from_utf8(bytes) { |
| Ok(v) => v.to_string(), |
| Err(_) => escape_bytes(bytes), |
| }; |
| let mut space_escaped = String::new(); |
| for c in show.chars() { |
| if c.is_whitespace() { |
| let escaped = if c as u32 <= 0x7F { |
| escape_byte(c as u8) |
| } else { |
| if c as u32 <= 0xFFFF { |
| format!(r"\u{{{:04x}}}", c as u32) |
| } else { |
| format!(r"\U{{{:08x}}}", c as u32) |
| } |
| }; |
| space_escaped.push_str(&escaped); |
| } else { |
| space_escaped.push(c); |
| } |
| } |
| space_escaped |
| } |
| |
| fn escape_bytes(bytes: &[u8]) -> String { |
| let mut s = String::new(); |
| for &b in bytes { |
| s.push_str(&escape_byte(b)); |
| } |
| s |
| } |
| |
| fn escape_byte(byte: u8) -> String { |
| use std::ascii::escape_default; |
| |
| let escaped: Vec<u8> = escape_default(byte).collect(); |
| String::from_utf8_lossy(&escaped).into_owned() |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use std::fmt; |
| |
| use {Expr, ExprBuilder}; |
| use super::{Literals, Lit, escape_bytes}; |
| |
| // To make test failures easier to read. |
| #[derive(Debug, Eq, PartialEq)] |
| struct Bytes(Vec<ULit>); |
| #[derive(Debug, Eq, PartialEq)] |
| struct Unicode(Vec<ULit>); |
| |
| fn escape_lits(blits: &[Lit]) -> Vec<ULit> { |
| let mut ulits = vec![]; |
| for blit in blits { |
| ulits.push(ULit { v: escape_bytes(&blit), cut: blit.is_cut() }); |
| } |
| ulits |
| } |
| |
| fn create_lits<I: IntoIterator<Item=Lit>>(it: I) -> Literals { |
| Literals { |
| lits: it.into_iter().collect(), |
| limit_size: 0, |
| limit_class: 0, |
| } |
| } |
| |
| // Needs to be pub for 1.3? |
| #[derive(Clone, Eq, PartialEq)] |
| pub struct ULit { |
| v: String, |
| cut: bool, |
| } |
| |
| impl ULit { |
| fn is_cut(&self) -> bool { self.cut } |
| } |
| |
| impl fmt::Debug for ULit { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if self.is_cut() { |
| write!(f, "Cut({})", self.v) |
| } else { |
| write!(f, "Complete({})", self.v) |
| } |
| } |
| } |
| |
| impl PartialEq<Lit> for ULit { |
| fn eq(&self, other: &Lit) -> bool { |
| self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() |
| } |
| } |
| |
| impl PartialEq<ULit> for Lit { |
| fn eq(&self, other: &ULit) -> bool { |
| &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() |
| } |
| } |
| |
| #[allow(non_snake_case)] |
| fn C(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: true } } |
| #[allow(non_snake_case)] |
| fn M(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: false } } |
| |
| fn prefixes(lits: &mut Literals, expr: &Expr) { |
| lits.union_prefixes(expr); |
| } |
| |
| fn suffixes(lits: &mut Literals, expr: &Expr) { |
| lits.union_suffixes(expr); |
| } |
| |
| macro_rules! assert_lit_eq { |
| ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ |
| let expected: Vec<ULit> = vec![$($expected_lit),*]; |
| let lits = $got_lits; |
| assert_eq!( |
| $which(expected.clone()), |
| $which(escape_lits(lits.literals()))); |
| assert_eq!( |
| !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), |
| lits.all_complete()); |
| assert_eq!( |
| expected.iter().any(|l| !l.is_cut()), |
| lits.any_complete()); |
| }}; |
| } |
| |
| macro_rules! test_lit { |
| ($name:ident, $which:ident, $re:expr) => { |
| test_lit!($name, $which, $re,); |
| }; |
| ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { |
| #[test] |
| fn $name() { |
| let expr = Expr::parse($re).unwrap(); |
| let lits = expr.$which(); |
| assert_lit_eq!(Unicode, lits, $($lit),*); |
| |
| let expr = ExprBuilder::new().allow_bytes(true).unicode(false) |
| .parse($re).unwrap(); |
| let lits = expr.$which(); |
| assert_lit_eq!(Bytes, lits, $($lit),*); |
| } |
| }; |
| } |
| |
| // ************************************************************************ |
| // Tests for prefix literal extraction. |
| // ************************************************************************ |
| |
| // Elementary tests. |
| test_lit!(pfx_one_lit1, prefixes, "a", M("a")); |
| test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); |
| test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); |
| test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); |
| test_lit!(pfx_class1, prefixes, "[1-4]", |
| M("1"), M("2"), M("3"), M("4")); |
| test_lit!(pfx_class2, prefixes, "(?u)[â˜ƒâ… ]", |
| M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); |
| test_lit!(pfx_class3, prefixes, "(?ui)[â˜ƒâ… ]", |
| M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), |
| M("\\xe2\\x98\\x83")); |
| test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", |
| M("A"), M("a")); |
| test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc", |
| M("ABC"), M("aBC"), M("AbC"), M("abC"), |
| M("ABc"), M("aBc"), M("Abc"), M("abc")); |
| test_lit!(pfx_group1, prefixes, "(a)", M("a")); |
| test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); |
| test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); |
| test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); |
| test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); |
| test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); |
| test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); |
| test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); |
| test_lit!(pfx_rep_range1, prefixes, "a{0}"); |
| test_lit!(pfx_rep_range2, prefixes, "a{0,}"); |
| test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); |
| test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); |
| test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); |
| test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); |
| test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); |
| |
| // Test regexes with concatenations. |
| test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); |
| test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); |
| test_lit!(pfx_cat3, prefixes, "(?i)[ab]z", |
| M("AZ"), M("BZ"), M("aZ"), M("bZ"), |
| M("Az"), M("Bz"), M("az"), M("bz")); |
| test_lit!(pfx_cat4, prefixes, "[ab][yz]", |
| M("ay"), M("by"), M("az"), M("bz")); |
| test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); |
| test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); |
| test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); |
| test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); |
| test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); |
| test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); |
| test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); |
| test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); |
| test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); |
| test_lit!(pfx_cat14, prefixes, "a^", C("a")); |
| test_lit!(pfx_cat15, prefixes, "$a"); |
| test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); |
| test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); |
| test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); |
| test_lit!(pfx_cat19, prefixes, "a.z", C("a")); |
| |
| // Test regexes with alternations. |
| test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); |
| test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); |
| test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); |
| test_lit!(pfx_alt4, prefixes, "a|b*"); |
| test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); |
| test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); |
| test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c", |
| C("a"), C("b"), M("c"), C("a"), C("ab"), M("c")); |
| test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); |
| |
| // Test regexes with empty assertions. |
| test_lit!(pfx_empty1, prefixes, "^a", M("a")); |
| test_lit!(pfx_empty2, prefixes, "^abc", M("abc")); |
| test_lit!(pfx_empty3, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); |
| |
| // Make sure some curious regexes have no prefixes. |
| test_lit!(pfx_nothing1, prefixes, "."); |
| test_lit!(pfx_nothing2, prefixes, "(?s)."); |
| test_lit!(pfx_nothing3, prefixes, "^"); |
| test_lit!(pfx_nothing4, prefixes, "$"); |
| test_lit!(pfx_nothing6, prefixes, "(?m)$"); |
| test_lit!(pfx_nothing7, prefixes, r"\b"); |
| test_lit!(pfx_nothing8, prefixes, r"\B"); |
| |
| // Test a few regexes that defeat any prefix literal detection. |
| test_lit!(pfx_defeated1, prefixes, ".a"); |
| test_lit!(pfx_defeated2, prefixes, "(?s).a"); |
| test_lit!(pfx_defeated3, prefixes, "a*b*c*"); |
| test_lit!(pfx_defeated4, prefixes, "a|."); |
| test_lit!(pfx_defeated5, prefixes, ".|a"); |
| test_lit!(pfx_defeated6, prefixes, "a|^"); |
| test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); |
| test_lit!(pfx_defeated8, prefixes, "$a"); |
| test_lit!(pfx_defeated9, prefixes, "(?m)$a"); |
| test_lit!(pfx_defeated10, prefixes, r"\ba"); |
| test_lit!(pfx_defeated11, prefixes, r"\Ba"); |
| test_lit!(pfx_defeated12, prefixes, "^*a"); |
| test_lit!(pfx_defeated13, prefixes, "^+a"); |
| |
| test_lit!( |
| pfx_crazy1, |
| prefixes, |
| r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", |
| C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam")); |
| |
| // ************************************************************************ |
| // Tests for quiting prefix literal search. |
| // ************************************************************************ |
| |
| macro_rules! test_exhausted { |
| ($name:ident, $which:ident, $re:expr) => { |
| test_exhausted!($name, $which, $re,); |
| }; |
| ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { |
| #[test] |
| fn $name() { |
| let expr = Expr::parse($re).unwrap(); |
| let mut lits = Literals::empty(); |
| lits.set_limit_size(20).set_limit_class(10); |
| $which(&mut lits, &expr); |
| assert_lit_eq!(Unicode, lits, $($lit),*); |
| |
| let expr = ExprBuilder::new().allow_bytes(true).unicode(false) |
| .parse($re).unwrap(); |
| let mut lits = Literals::empty(); |
| lits.set_limit_size(20).set_limit_class(10); |
| $which(&mut lits, &expr); |
| assert_lit_eq!(Bytes, lits, $($lit),*); |
| } |
| }; |
| } |
| |
| // These test use a much lower limit than the default so that we can |
| // write test cases of reasonable size. |
| test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); |
| test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); |
| test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); |
| test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar", |
| C("FO"), C("fO"), C("Fo"), C("fo")); |
| test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}", |
| C("abababababababababab")); |
| test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd", |
| C("ababababab"), M("cd")); |
| test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd", |
| C("zababababab"), M("zcd")); |
| test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz", |
| C("aaaaaaaaaaaaaaaaaaaa")); |
| |
| // ************************************************************************ |
| // Tests for suffix literal extraction. |
| // ************************************************************************ |
| |
| // Elementary tests. |
| test_lit!(sfx_one_lit1, suffixes, "a", M("a")); |
| test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); |
| test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); |
| test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); |
| test_lit!(sfx_class1, suffixes, "[1-4]", |
| M("1"), M("2"), M("3"), M("4")); |
| test_lit!(sfx_class2, suffixes, "(?u)[â˜ƒâ… ]", |
| M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); |
| test_lit!(sfx_class3, suffixes, "(?ui)[â˜ƒâ… ]", |
| M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), |
| M("\\xe2\\x98\\x83")); |
| test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", |
| M("A"), M("a")); |
| test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc", |
| M("ABC"), M("ABc"), M("AbC"), M("Abc"), |
| M("aBC"), M("aBc"), M("abC"), M("abc")); |
| test_lit!(sfx_group1, suffixes, "(a)", M("a")); |
| test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); |
| test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); |
| test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); |
| test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); |
| test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); |
| test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); |
| test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); |
| test_lit!(sfx_rep_range1, suffixes, "a{0}"); |
| test_lit!(sfx_rep_range2, suffixes, "a{0,}"); |
| test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); |
| test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); |
| test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); |
| test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); |
| test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); |
| |
| // Test regexes with concatenations. |
| test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); |
| test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); |
| test_lit!(sfx_cat3, suffixes, "(?i)[ab]z", |
| M("AZ"), M("Az"), M("BZ"), M("Bz"), |
| M("aZ"), M("az"), M("bZ"), M("bz")); |
| test_lit!(sfx_cat4, suffixes, "[ab][yz]", |
| M("ay"), M("az"), M("by"), M("bz")); |
| test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); |
| test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); |
| test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); |
| test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); |
| test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); |
| test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); |
| test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); |
| test_lit!(sfx_cat12, suffixes, "ab+", C("b")); |
| test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); |
| test_lit!(sfx_cat14, suffixes, "a^"); |
| test_lit!(sfx_cat15, suffixes, "$a", C("a")); |
| test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); |
| test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); |
| test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); |
| test_lit!(sfx_cat19, suffixes, "a.z", C("z")); |
| |
| // Test regexes with alternations. |
| test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); |
| test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); |
| test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); |
| test_lit!(sfx_alt4, suffixes, "a|b*"); |
| test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); |
| test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); |
| test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c", |
| C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c")); |
| test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); |
| |
| // Test regexes with empty assertions. |
| test_lit!(sfx_empty1, suffixes, "a$", M("a")); |
| |
| // Make sure some curious regexes have no suffixes. |
| test_lit!(sfx_nothing1, suffixes, "."); |
| test_lit!(sfx_nothing2, suffixes, "(?s)."); |
| test_lit!(sfx_nothing3, suffixes, "^"); |
| test_lit!(sfx_nothing4, suffixes, "$"); |
| test_lit!(sfx_nothing6, suffixes, "(?m)$"); |
| test_lit!(sfx_nothing7, suffixes, r"\b"); |
| test_lit!(sfx_nothing8, suffixes, r"\B"); |
| |
| // Test a few regexes that defeat any suffix literal detection. |
| test_lit!(sfx_defeated1, suffixes, "a."); |
| test_lit!(sfx_defeated2, suffixes, "(?s)a."); |
| test_lit!(sfx_defeated3, suffixes, "a*b*c*"); |
| test_lit!(sfx_defeated4, suffixes, "a|."); |
| test_lit!(sfx_defeated5, suffixes, ".|a"); |
| test_lit!(sfx_defeated6, suffixes, "a|^"); |
| test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); |
| test_lit!(sfx_defeated8, suffixes, "a^"); |
| test_lit!(sfx_defeated9, suffixes, "(?m)a$"); |
| test_lit!(sfx_defeated10, suffixes, r"a\b"); |
| test_lit!(sfx_defeated11, suffixes, r"a\B"); |
| test_lit!(sfx_defeated12, suffixes, "a^*"); |
| test_lit!(sfx_defeated13, suffixes, "a^+"); |
| |
| // These test use a much lower limit than the default so that we can |
| // write test cases of reasonable size. |
| test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); |
| test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); |
| test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); |
| test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar", |
| C("AR"), C("Ar"), C("aR"), C("ar")); |
| test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}", |
| C("abababababababababab")); |
| test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*", |
| C("ababababab"), M("cd")); |
| test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z", |
| C("abababababz"), M("cdz")); |
| test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa", |
| C("aaaaaaaaaaaaaaaaaaaa")); |
| |
| // ************************************************************************ |
| // Tests for generating unambiguous literal sets. |
| // ************************************************************************ |
| |
| macro_rules! test_unamb { |
| ($name:ident, $given:expr, $expected:expr) => { |
| #[test] |
| fn $name() { |
| let given: Vec<Lit> = |
| $given |
| .into_iter() |
| .map(|ul| { |
| let cut = ul.is_cut(); |
| Lit { v: ul.v.into_bytes(), cut: cut } |
| }) |
| .collect(); |
| let lits = create_lits(given); |
| let got = lits.unambiguous_prefixes(); |
| assert_eq!($expected, escape_lits(got.literals())); |
| } |
| }; |
| } |
| |
| test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); |
| test_unamb!(unambiguous2, |
| vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]); |
| test_unamb!(unambiguous3, |
| vec![M("Sherlock"), M("Watson")], |
| vec![M("Sherlock"), M("Watson")]); |
| test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); |
| test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); |
| test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); |
| test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); |
| test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); |
| test_unamb!(unambiguous9, |
| vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], |
| vec![C("a"), C("b"), C("c")]); |
| test_unamb!(unambiguous10, |
| vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], |
| vec![C("Mo"), C("Mu")]); |
| test_unamb!(unambiguous11, |
| vec![M("zazb"), M("azb")], vec![C("a"), C("z")]); |
| test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); |
| test_unamb!(unambiguous13, |
| vec![M("ABCX"), M("CDAX"), M("BCX")], |
| vec![C("A"), C("BCX"), C("CD")]); |
| test_unamb!(unambiguous14, |
| vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], |
| vec![M("DSX"), C("I"), C("MGX"), C("MV")]); |
| test_unamb!(unambiguous15, |
| vec![M("IMG_"), M("MG_"), M("CIMG")], |
| vec![C("C"), C("I"), C("MG_")]); |
| |
| |
| // ************************************************************************ |
| // Tests for suffix trimming. |
| // ************************************************************************ |
| macro_rules! test_trim { |
| ($name:ident, $trim:expr, $given:expr, $expected:expr) => { |
| #[test] |
| fn $name() { |
| let given: Vec<Lit> = |
| $given |
| .into_iter() |
| .map(|ul| { |
| let cut = ul.is_cut(); |
| Lit { v: ul.v.into_bytes(), cut: cut } |
| }) |
| .collect(); |
| let lits = create_lits(given); |
| let got = lits.trim_suffix($trim).unwrap(); |
| assert_eq!($expected, escape_lits(got.literals())); |
| } |
| } |
| } |
| |
| test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); |
| test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); |
| test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); |
| test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); |
| |
| // ************************************************************************ |
| // Tests for longest common prefix. |
| // ************************************************************************ |
| |
| macro_rules! test_lcp { |
| ($name:ident, $given:expr, $expected:expr) => { |
| #[test] |
| fn $name() { |
| let given: Vec<Lit> = |
| $given |
| .into_iter() |
| .map(|s: &str| Lit { |
| v: s.to_owned().into_bytes(), |
| cut: false, |
| }) |
| .collect(); |
| let lits = create_lits(given); |
| let got = lits.longest_common_prefix(); |
| assert_eq!($expected, escape_bytes(got)); |
| } |
| }; |
| } |
| |
| test_lcp!(lcp1, vec!["a"], "a"); |
| test_lcp!(lcp2, vec![], ""); |
| test_lcp!(lcp3, vec!["a", "b"], ""); |
| test_lcp!(lcp4, vec!["ab", "ab"], "ab"); |
| test_lcp!(lcp5, vec!["ab", "a"], "a"); |
| test_lcp!(lcp6, vec!["a", "ab"], "a"); |
| test_lcp!(lcp7, vec!["ab", "b"], ""); |
| test_lcp!(lcp8, vec!["b", "ab"], ""); |
| test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); |
| test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); |
| test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); |
| test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); |
| |
| // ************************************************************************ |
| // Tests for longest common suffix. |
| // ************************************************************************ |
| |
| macro_rules! test_lcs { |
| ($name:ident, $given:expr, $expected:expr) => { |
| #[test] |
| fn $name() { |
| let given: Vec<Lit> = |
| $given |
| .into_iter() |
| .map(|s: &str| Lit { |
| v: s.to_owned().into_bytes(), |
| cut: false, |
| }) |
| .collect(); |
| let lits = create_lits(given); |
| let got = lits.longest_common_suffix(); |
| assert_eq!($expected, escape_bytes(got)); |
| } |
| }; |
| } |
| |
| test_lcs!(lcs1, vec!["a"], "a"); |
| test_lcs!(lcs2, vec![], ""); |
| test_lcs!(lcs3, vec!["a", "b"], ""); |
| test_lcs!(lcs4, vec!["ab", "ab"], "ab"); |
| test_lcs!(lcs5, vec!["ab", "a"], ""); |
| test_lcs!(lcs6, vec!["a", "ab"], ""); |
| test_lcs!(lcs7, vec!["ab", "b"], "b"); |
| test_lcs!(lcs8, vec!["b", "ab"], "b"); |
| test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); |
| test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); |
| test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); |
| test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); |
| } |