| // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
| // file at the top-level directory of this distribution and at |
| // http://rust-lang.org/COPYRIGHT. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| /*! |
| This crate provides a regular expression parser and an abstract syntax for |
| regular expressions. The abstract syntax is defined by the `Expr` type. The |
| concrete syntax is enumerated in the |
| [`regex`](../regex/index.html#syntax) |
| crate documentation. |
| |
| Note that since this crate is first and foremost an implementation detail for |
| the `regex` crate, it may experience more frequent breaking changes. It is |
| exposed as a separate crate so that others may use it to do analysis on regular |
| expressions or even build their own matching engine. |
| |
| # Example: parsing an expression |
| |
| Parsing a regular expression can be done with the `Expr::parse` function. |
| |
| ```rust |
| use regex_syntax::Expr; |
| |
| assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![ |
| Expr::Literal { chars: vec!['a', 'b'], casei: false }, |
| Expr::Literal { chars: vec!['y', 'z'], casei: false }, |
| ])); |
| ``` |
| |
| # Example: inspecting an error |
| |
| The parser in this crate provides very detailed error values. For example, |
| if an invalid character class range is given: |
| |
| ```rust |
| use regex_syntax::{Expr, ErrorKind}; |
| |
| let err = Expr::parse(r"[z-a]").unwrap_err(); |
| assert_eq!(err.position(), 4); |
| assert_eq!(err.kind(), &ErrorKind::InvalidClassRange { |
| start: 'z', |
| end: 'a', |
| }); |
| ``` |
| |
| Or unbalanced parentheses: |
| |
| ```rust |
| use regex_syntax::{Expr, ErrorKind}; |
| |
| let err = Expr::parse(r"ab(cd").unwrap_err(); |
| assert_eq!(err.position(), 2); |
| assert_eq!(err.kind(), &ErrorKind::UnclosedParen); |
| ``` |
| */ |
| |
| #![deny(missing_docs)] |
| #![cfg_attr(test, deny(warnings))] |
| |
| #[cfg(test)] extern crate quickcheck; |
| #[cfg(test)] extern crate rand; |
| |
| mod literals; |
| mod parser; |
| mod unicode; |
| |
| use std::ascii; |
| use std::char; |
| use std::cmp::{Ordering, max, min}; |
| use std::fmt; |
| use std::iter::IntoIterator; |
| use std::ops::Deref; |
| use std::result; |
| use std::slice; |
| use std::u8; |
| use std::vec; |
| |
| use unicode::case_folding; |
| |
| use self::Expr::*; |
| use self::Repeater::*; |
| |
| use parser::{Flags, Parser}; |
| |
| pub use literals::{Literals, Lit}; |
| |
| /// A regular expression abstract syntax tree. |
| /// |
| /// An `Expr` represents the abstract syntax of a regular expression. |
| #[derive(Clone, Debug, PartialEq, Eq)] |
| pub enum Expr { |
| /// An empty regex (which never matches any text). |
| Empty, |
| /// A sequence of one or more literal characters to be matched. |
| Literal { |
| /// The characters. |
| chars: Vec<char>, |
| /// Whether to match case insensitively. |
| casei: bool, |
| }, |
| /// A sequence of one or more literal bytes to be matched. |
| LiteralBytes { |
| /// The bytes. |
| bytes: Vec<u8>, |
| /// Whether to match case insensitively. |
| /// |
| /// The interpretation of "case insensitive" in this context is |
| /// ambiguous since `bytes` can be arbitrary. However, a good heuristic |
| /// is to assume that the bytes are ASCII-compatible and do simple |
| /// ASCII case folding. |
| casei: bool, |
| }, |
| /// Match any character. |
| AnyChar, |
| /// Match any character, excluding new line (`0xA`). |
| AnyCharNoNL, |
| /// Match any byte. |
| AnyByte, |
| /// Match any byte, excluding new line (`0xA`). |
| AnyByteNoNL, |
| /// A character class. |
| Class(CharClass), |
| /// A character class with byte ranges only. |
| ClassBytes(ByteClass), |
| /// Match the start of a line or beginning of input. |
| StartLine, |
| /// Match the end of a line or end of input. |
| EndLine, |
| /// Match the beginning of input. |
| StartText, |
| /// Match the end of input. |
| EndText, |
| /// Match a word boundary (word character on one side and a non-word |
| /// character on the other). |
| WordBoundary, |
| /// Match a position that is not a word boundary (word or non-word |
| /// characters on both sides). |
| NotWordBoundary, |
| /// Match an ASCII word boundary. |
| WordBoundaryAscii, |
| /// Match a position that is not an ASCII word boundary. |
| NotWordBoundaryAscii, |
| /// A group, possibly non-capturing. |
| Group { |
| /// The expression inside the group. |
| e: Box<Expr>, |
| /// The capture index (starting at `1`) only for capturing groups. |
| i: Option<usize>, |
| /// The capture name, only for capturing named groups. |
| name: Option<String>, |
| }, |
| /// A repeat operator (`?`, `*`, `+` or `{m,n}`). |
| Repeat { |
| /// The expression to be repeated. Limited to literals, `.`, classes |
| /// or grouped expressions. |
| e: Box<Expr>, |
| /// The type of repeat operator used. |
| r: Repeater, |
| /// Whether the repeat is greedy (match the most) or not (match the |
| /// least). |
| greedy: bool, |
| }, |
| /// A concatenation of expressions. Must be matched one after the other. |
| /// |
| /// N.B. A concat expression can only appear at the top-level or |
| /// immediately inside a group expression. |
| Concat(Vec<Expr>), |
| /// An alternation of expressions. Only one must match. |
| /// |
| /// N.B. An alternate expression can only appear at the top-level or |
| /// immediately inside a group expression. |
| Alternate(Vec<Expr>), |
| } |
| |
| type CaptureIndex = Option<usize>; |
| |
| type CaptureName = Option<String>; |
| |
| /// The type of a repeat operator expression. |
| #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
| pub enum Repeater { |
| /// Match zero or one (`?`). |
| ZeroOrOne, |
| /// Match zero or more (`*`). |
| ZeroOrMore, |
| /// Match one or more (`+`). |
| OneOrMore, |
| /// Match for at least `min` and at most `max` (`{m,n}`). |
| /// |
| /// When `max` is `None`, there is no upper bound on the number of matches. |
| Range { |
| /// Lower bound on the number of matches. |
| min: u32, |
| /// Optional upper bound on the number of matches. |
| max: Option<u32>, |
| }, |
| } |
| |
| impl Repeater { |
| /// Returns true if and only if this repetition can match the empty string. |
| fn matches_empty(&self) -> bool { |
| use self::Repeater::*; |
| match *self { |
| ZeroOrOne => true, |
| ZeroOrMore => true, |
| OneOrMore => false, |
| Range { min, .. } => min == 0, |
| } |
| } |
| } |
| |
| /// A character class. |
| /// |
| /// A character class has a canonical format that the parser guarantees. Its |
| /// canonical format is defined by the following invariants: |
| /// |
| /// 1. Given any Unicode scalar value, it is matched by *at most* one character |
| /// range in a canonical character class. |
| /// 2. Every adjacent character range is separated by at least one Unicode |
| /// scalar value. |
| /// 3. Given any pair of character ranges `r1` and `r2`, if |
| /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical |
| /// character class. |
| /// |
| /// In sum, any `CharClass` produced by this crate's parser is a sorted |
| /// sequence of non-overlapping ranges. This makes it possible to test whether |
| /// a character is matched by a class with a binary search. |
| /// |
| /// If the case insensitive flag was set when parsing a character class, then |
| /// simple case folding is done automatically. For example, `(?i)[a-c]` is |
| /// automatically translated to `[a-cA-C]`. |
| #[derive(Clone, Debug, PartialEq, Eq)] |
| pub struct CharClass { |
| ranges: Vec<ClassRange>, |
| } |
| |
| /// A single inclusive range in a character class. |
| /// |
| /// Since range boundaries are defined by Unicode scalar values, the boundaries |
| /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may |
| /// *cover* codepoints that are not scalar values. |
| /// |
| /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd` |
| /// for testing whether a character is contained inside a given range. |
| #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] |
| pub struct ClassRange { |
| /// The start character of the range. |
| /// |
| /// This must be less than or equal to `end`. |
| pub start: char, |
| |
| /// The end character of the range. |
| /// |
| /// This must be greater than or equal to `start`. |
| pub end: char, |
| } |
| |
| /// A byte class for byte ranges only. |
| /// |
| /// A byte class has a canonical format that the parser guarantees. Its |
| /// canonical format is defined by the following invariants: |
| /// |
| /// 1. Given any byte, it is matched by *at most* one byte range in a canonical |
| /// character class. |
| /// 2. Every adjacent byte range is separated by at least one byte. |
| /// 3. Given any pair of byte ranges `r1` and `r2`, if |
| /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical |
| /// character class. |
| /// |
| /// In sum, any `ByteClass` produced by this crate's parser is a sorted |
| /// sequence of non-overlapping ranges. This makes it possible to test whether |
| /// a byte is matched by a class with a binary search. |
| /// |
| /// If the case insensitive flag was set when parsing a character class, |
| /// then simple ASCII-only case folding is done automatically. For example, |
| /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`. |
| #[derive(Clone, Debug, PartialEq, Eq)] |
| pub struct ByteClass { |
| ranges: Vec<ByteRange>, |
| } |
| |
| /// A single inclusive range in a byte class. |
| /// |
| /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd` |
| /// for testing whether a byte is contained inside a given range. |
| #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] |
| pub struct ByteRange { |
| /// The start byte of the range. |
| /// |
| /// This must be less than or equal to `end`. |
| pub start: u8, |
| |
| /// The end byte of the range. |
| /// |
| /// This must be greater than or equal to `end`. |
| pub end: u8, |
| } |
| |
| /// A builder for configuring regular expression parsing. |
| /// |
| /// This allows setting the default values of flags and other options, such |
| /// as the maximum nesting depth. |
| #[derive(Clone, Debug)] |
| pub struct ExprBuilder { |
| flags: Flags, |
| nest_limit: usize, |
| } |
| |
| impl ExprBuilder { |
| /// Create a new builder for configuring expression parsing. |
| /// |
| /// Note that all flags are disabled by default. |
| pub fn new() -> ExprBuilder { |
| ExprBuilder { |
| flags: Flags::default(), |
| nest_limit: 200, |
| } |
| } |
| |
| /// Set the default value for the case insensitive (`i`) flag. |
| pub fn case_insensitive(mut self, yes: bool) -> ExprBuilder { |
| self.flags.casei = yes; |
| self |
| } |
| |
| /// Set the default value for the multi-line matching (`m`) flag. |
| pub fn multi_line(mut self, yes: bool) -> ExprBuilder { |
| self.flags.multi = yes; |
| self |
| } |
| |
| /// Set the default value for the any character (`s`) flag. |
| pub fn dot_matches_new_line(mut self, yes: bool) -> ExprBuilder { |
| self.flags.dotnl = yes; |
| self |
| } |
| |
| /// Set the default value for the greedy swap (`U`) flag. |
| pub fn swap_greed(mut self, yes: bool) -> ExprBuilder { |
| self.flags.swap_greed = yes; |
| self |
| } |
| |
| /// Set the default value for the ignore whitespace (`x`) flag. |
| pub fn ignore_whitespace(mut self, yes: bool) -> ExprBuilder { |
| self.flags.ignore_space = yes; |
| self |
| } |
| |
| /// Set the default value for the Unicode (`u`) flag. |
| /// |
| /// If `yes` is false, then `allow_bytes` is set to true. |
| pub fn unicode(mut self, yes: bool) -> ExprBuilder { |
| self.flags.unicode = yes; |
| if !yes { |
| self.allow_bytes(true) |
| } else { |
| self |
| } |
| } |
| |
| /// Whether the parser allows matching arbitrary bytes or not. |
| /// |
| /// When the `u` flag is disabled (either with this builder or in the |
| /// expression itself), the parser switches to interpreting the expression |
| /// as matching arbitrary bytes instead of Unicode codepoints. For example, |
| /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which |
| /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely, |
| /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8. |
| /// |
| /// When `allow_bytes` is disabled (the default), an expression like |
| /// `(?-u:\xFF)` will cause the parser to return an error, since it would |
| /// otherwise match invalid UTF-8. When enabled, it will be allowed. |
| pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder { |
| self.flags.allow_bytes = yes; |
| self |
| } |
| |
| /// Set the nesting limit for regular expression parsing. |
| /// |
| /// Regular expressions that nest more than this limit will result in a |
| /// `StackExhausted` error. |
| pub fn nest_limit(mut self, limit: usize) -> ExprBuilder { |
| self.nest_limit = limit; |
| self |
| } |
| |
| /// Parse a string as a regular expression using the current configuraiton. |
| pub fn parse(self, s: &str) -> Result<Expr> { |
| Parser::parse(s, self.flags).and_then(|e| e.simplify(self.nest_limit)) |
| } |
| } |
| |
| impl Expr { |
| /// Parses a string in a regular expression syntax tree. |
| /// |
| /// This is a convenience method for parsing an expression using the |
| /// default configuration. To tweak parsing options (such as which flags |
| /// are enabled by default), use the `ExprBuilder` type. |
| pub fn parse(s: &str) -> Result<Expr> { |
| ExprBuilder::new().parse(s) |
| } |
| |
| /// Returns true iff the expression can be repeated by a quantifier. |
| fn can_repeat(&self) -> bool { |
| match *self { |
| Literal{..} | LiteralBytes{..} |
| | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL |
| | Class(_) | ClassBytes(_) |
| | StartLine | EndLine | StartText | EndText |
| | WordBoundary | NotWordBoundary |
| | WordBoundaryAscii | NotWordBoundaryAscii |
| | Group{..} |
| => true, |
| _ => false, |
| } |
| } |
| |
| fn simplify(self, nest_limit: usize) -> Result<Expr> { |
| fn combine_literals(es: &mut Vec<Expr>, e: Expr) { |
| match (es.pop(), e) { |
| (None, e) => es.push(e), |
| (Some(Literal { chars: mut chars1, casei: casei1 }), |
| Literal { chars: chars2, casei: casei2 }) => { |
| if casei1 == casei2 { |
| chars1.extend(chars2); |
| es.push(Literal { chars: chars1, casei: casei1 }); |
| } else { |
| es.push(Literal { chars: chars1, casei: casei1 }); |
| es.push(Literal { chars: chars2, casei: casei2 }); |
| } |
| } |
| (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }), |
| LiteralBytes { bytes: bytes2, casei: casei2 }) => { |
| if casei1 == casei2 { |
| bytes1.extend(bytes2); |
| es.push(LiteralBytes { bytes: bytes1, casei: casei1 }); |
| } else { |
| es.push(LiteralBytes { bytes: bytes1, casei: casei1 }); |
| es.push(LiteralBytes { bytes: bytes2, casei: casei2 }); |
| } |
| } |
| (Some(e1), e2) => { |
| es.push(e1); |
| es.push(e2); |
| } |
| } |
| } |
| fn simp(expr: Expr, recurse: usize, limit: usize) -> Result<Expr> { |
| if recurse > limit { |
| return Err(Error { |
| pos: 0, |
| surround: "".to_owned(), |
| kind: ErrorKind::StackExhausted, |
| }); |
| } |
| let simplify = |e| simp(e, recurse + 1, limit); |
| Ok(match expr { |
| Repeat { e, r, greedy } => Repeat { |
| e: Box::new(try!(simplify(*e))), |
| r: r, |
| greedy: greedy, |
| }, |
| Group { e, i, name } => { |
| let e = try!(simplify(*e)); |
| if i.is_none() && name.is_none() && e.can_repeat() { |
| e |
| } else { |
| Group { e: Box::new(e), i: i, name: name } |
| } |
| } |
| Concat(es) => { |
| let mut new_es = Vec::with_capacity(es.len()); |
| for e in es { |
| combine_literals(&mut new_es, try!(simplify(e))); |
| } |
| if new_es.len() == 1 { |
| new_es.pop().unwrap() |
| } else { |
| Concat(new_es) |
| } |
| } |
| Alternate(es) => { |
| let mut new_es = Vec::with_capacity(es.len()); |
| for e in es { |
| new_es.push(try!(simplify(e))); |
| } |
| Alternate(new_es) |
| } |
| e => e, |
| }) |
| } |
| simp(self, 0, nest_limit) |
| } |
| |
| /// Returns a set of literal prefixes extracted from this expression. |
| pub fn prefixes(&self) -> Literals { |
| let mut lits = Literals::empty(); |
| lits.union_prefixes(self); |
| lits |
| } |
| |
| /// Returns a set of literal suffixes extracted from this expression. |
| pub fn suffixes(&self) -> Literals { |
| let mut lits = Literals::empty(); |
| lits.union_suffixes(self); |
| lits |
| } |
| |
| /// Returns true if and only if the expression is required to match from |
| /// the beginning of text. |
| pub fn is_anchored_start(&self) -> bool { |
| match *self { |
| Repeat { ref e, r, .. } => { |
| !r.matches_empty() && e.is_anchored_start() |
| } |
| Group { ref e, .. } => e.is_anchored_start(), |
| Concat(ref es) => es[0].is_anchored_start(), |
| Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()), |
| StartText => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if and only if the expression has at least one matchable |
| /// sub-expression that must match the beginning of text. |
| pub fn has_anchored_start(&self) -> bool { |
| match *self { |
| Repeat { ref e, r, .. } => { |
| !r.matches_empty() && e.has_anchored_start() |
| } |
| Group { ref e, .. } => e.has_anchored_start(), |
| Concat(ref es) => es[0].has_anchored_start(), |
| Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()), |
| StartText => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if and only if the expression is required to match at the |
| /// end of the text. |
| pub fn is_anchored_end(&self) -> bool { |
| match *self { |
| Repeat { ref e, r, .. } => { |
| !r.matches_empty() && e.is_anchored_end() |
| } |
| Group { ref e, .. } => e.is_anchored_end(), |
| Concat(ref es) => es[es.len() - 1].is_anchored_end(), |
| Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()), |
| EndText => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if and only if the expression has at least one matchable |
| /// sub-expression that must match the beginning of text. |
| pub fn has_anchored_end(&self) -> bool { |
| match *self { |
| Repeat { ref e, r, .. } => { |
| !r.matches_empty() && e.has_anchored_end() |
| } |
| Group { ref e, .. } => e.has_anchored_end(), |
| Concat(ref es) => es[es.len() - 1].has_anchored_end(), |
| Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()), |
| EndText => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if and only if the expression contains sub-expressions |
| /// that can match arbitrary bytes. |
| pub fn has_bytes(&self) -> bool { |
| match *self { |
| Repeat { ref e, .. } => e.has_bytes(), |
| Group { ref e, .. } => e.has_bytes(), |
| Concat(ref es) => es.iter().any(|e| e.has_bytes()), |
| Alternate(ref es) => es.iter().any(|e| e.has_bytes()), |
| LiteralBytes{..} => true, |
| AnyByte | AnyByteNoNL => true, |
| ClassBytes(_) => true, |
| WordBoundaryAscii | NotWordBoundaryAscii => true, |
| _ => false, |
| } |
| } |
| } |
| |
| impl Deref for CharClass { |
| type Target = Vec<ClassRange>; |
| fn deref(&self) -> &Vec<ClassRange> { &self.ranges } |
| } |
| |
| impl IntoIterator for CharClass { |
| type Item = ClassRange; |
| type IntoIter = vec::IntoIter<ClassRange>; |
| fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() } |
| } |
| |
| impl<'a> IntoIterator for &'a CharClass { |
| type Item = &'a ClassRange; |
| type IntoIter = slice::Iter<'a, ClassRange>; |
| fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() } |
| } |
| |
| impl CharClass { |
| /// Create a new class from an existing set of ranges. |
| pub fn new(ranges: Vec<ClassRange>) -> CharClass { |
| CharClass { ranges: ranges } |
| } |
| |
| /// Create an empty class. |
| fn empty() -> CharClass { |
| CharClass::new(Vec::new()) |
| } |
| |
| /// Returns true if `c` is matched by this character class. |
| pub fn matches(&self, c: char) -> bool { |
| self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok() |
| } |
| |
| /// Removes the given character from the class if it exists. |
| /// |
| /// Note that this takes `O(n)` time in the number of ranges. |
| pub fn remove(&mut self, c: char) { |
| let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) { |
| Ok(i) => i, |
| Err(_) => return, |
| }; |
| let mut r = self.ranges.remove(i); |
| if r.start == c { |
| r.start = inc_char(c); |
| if r.start > r.end || c == char::MAX { |
| return; |
| } |
| self.ranges.insert(i, r); |
| } else if r.end == c { |
| r.end = dec_char(c); |
| if r.end < r.start || c == '\x00' { |
| return; |
| } |
| self.ranges.insert(0, r); |
| } else { |
| let (mut r1, mut r2) = (r.clone(), r.clone()); |
| r1.end = dec_char(c); |
| if r1.start <= r1.end { |
| self.ranges.insert(i, r1); |
| i += 1; |
| } |
| r2.start = inc_char(c); |
| if r2.start <= r2.end { |
| self.ranges.insert(i, r2); |
| } |
| } |
| } |
| |
| /// Create a new empty class from this one. |
| fn to_empty(&self) -> CharClass { |
| CharClass { ranges: Vec::with_capacity(self.len()) } |
| } |
| |
| /// Create a byte class from this character class. |
| /// |
| /// Codepoints above 0xFF are removed. |
| fn to_byte_class(self) -> ByteClass { |
| ByteClass::new( |
| self.ranges.into_iter() |
| .filter_map(|r| r.to_byte_range()) |
| .collect()).canonicalize() |
| } |
| |
| /// Merge two classes and canonicalize them. |
| #[cfg(test)] |
| fn merge(mut self, other: CharClass) -> CharClass { |
| self.ranges.extend(other); |
| self.canonicalize() |
| } |
| |
| /// Canonicalze any sequence of ranges. |
| /// |
| /// This is responsible for enforcing the canonical format invariants |
| /// as described on the docs for the `CharClass` type. |
| fn canonicalize(mut self) -> CharClass { |
| // TODO: Save some cycles here by checking if already canonicalized. |
| self.ranges.sort(); |
| let mut ordered = self.to_empty(); // TODO: Do this in place? |
| for candidate in self { |
| // If the candidate overlaps with an existing range, then it must |
| // be the most recent range added because we process the candidates |
| // in order. |
| if let Some(or) = ordered.ranges.last_mut() { |
| if or.overlapping(candidate) { |
| *or = or.merge(candidate); |
| continue; |
| } |
| } |
| ordered.ranges.push(candidate); |
| } |
| ordered |
| } |
| |
| /// Negates the character class. |
| /// |
| /// For all `c` where `c` is a Unicode scalar value, `c` matches `self` |
| /// if and only if `c` does not match `self.negate()`. |
| pub fn negate(mut self) -> CharClass { |
| fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) } |
| |
| if self.is_empty() { |
| // Inverting an empty range yields all of Unicode. |
| return CharClass { |
| ranges: vec![ClassRange { start: '\x00', end: '\u{10ffff}' }], |
| }; |
| } |
| self = self.canonicalize(); |
| let mut inv = self.to_empty(); |
| if self[0].start > '\x00' { |
| inv.ranges.push(range('\x00', dec_char(self[0].start))); |
| } |
| for win in self.windows(2) { |
| inv.ranges.push(range(inc_char(win[0].end), |
| dec_char(win[1].start))); |
| } |
| if self[self.len() - 1].end < char::MAX { |
| inv.ranges.push(range(inc_char(self[self.len() - 1].end), |
| char::MAX)); |
| } |
| inv |
| } |
| |
| /// Apply case folding to this character class. |
| /// |
| /// N.B. Applying case folding to a negated character class probably |
| /// won't produce the expected result. e.g., `(?i)[^x]` really should |
| /// match any character sans `x` and `X`, but if `[^x]` is negated |
| /// before being case folded, you'll end up matching any character. |
| pub fn case_fold(self) -> CharClass { |
| let mut folded = self.to_empty(); |
| for r in self { |
| // Applying case folding to a range is expensive because *every* |
| // character needs to be examined. Thus, we avoid that drudgery |
| // if no character in the current range is in our case folding |
| // table. |
| if r.needs_case_folding() { |
| folded.ranges.extend(r.case_fold()); |
| } |
| folded.ranges.push(r); |
| } |
| folded.canonicalize() |
| } |
| |
| /// Returns the number of characters that match this class. |
| fn num_chars(&self) -> usize { |
| self.ranges.iter() |
| .map(|&r| 1 + (r.end as u32) - (r.start as u32)) |
| .fold(0, |acc, len| acc + len) |
| as usize |
| } |
| } |
| |
| impl ClassRange { |
| /// Create a new class range. |
| /// |
| /// If `end < start`, then the two values are swapped so that |
| /// the invariant `start <= end` is preserved. |
| fn new(start: char, end: char) -> ClassRange { |
| if start <= end { |
| ClassRange { start: start, end: end } |
| } else { |
| ClassRange { start: end, end: start } |
| } |
| } |
| |
| /// Translate this to a byte class. |
| /// |
| /// If the start codepoint exceeds 0xFF, then this returns `None`. |
| /// |
| /// If the end codepoint exceeds 0xFF, then it is set to 0xFF. |
| fn to_byte_range(self) -> Option<ByteRange> { |
| if self.start > '\u{FF}' { |
| None |
| } else { |
| let s = self.start as u8; |
| let e = min('\u{FF}', self.end) as u8; |
| Some(ByteRange::new(s, e)) |
| } |
| } |
| |
| /// Create a range of one character. |
| fn one(c: char) -> ClassRange { |
| ClassRange { start: c, end: c } |
| } |
| |
| /// Returns true if and only if the two ranges are overlapping. Note that |
| /// since ranges are inclusive, `a-c` and `d-f` are overlapping! |
| fn overlapping(self, other: ClassRange) -> bool { |
| max(self.start, other.start) <= inc_char(min(self.end, other.end)) |
| } |
| |
| /// Creates a new range representing the union of `self` and `other. |
| fn merge(self, other: ClassRange) -> ClassRange { |
| ClassRange { |
| start: min(self.start, other.start), |
| end: max(self.end, other.end), |
| } |
| } |
| |
| /// Returns true if and only if this range contains a character that is |
| /// in the case folding table. |
| fn needs_case_folding(self) -> bool { |
| case_folding::C_plus_S_both_table |
| .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok() |
| } |
| |
| /// Apply case folding to this range. |
| /// |
| /// Since case folding might add characters such that the range is no |
| /// longer contiguous, this returns multiple class ranges. They are in |
| /// canonical order. |
| fn case_fold(self) -> Vec<ClassRange> { |
| let table = &case_folding::C_plus_S_both_table; |
| let (s, e) = (self.start as u32, self.end as u32 + 1); |
| let mut start = self.start; |
| let mut end = start; |
| let mut next_case_fold = '\x00'; |
| let mut ranges = Vec::with_capacity(10); |
| for mut c in (s..e).filter_map(char::from_u32) { |
| if c >= next_case_fold { |
| c = match simple_case_fold_both_result(c) { |
| Ok(i) => { |
| for &(c1, c2) in &table[i..] { |
| if c1 != c { |
| break; |
| } |
| if c2 != inc_char(end) { |
| ranges.push(ClassRange::new(start, end)); |
| start = c2; |
| } |
| end = c2; |
| } |
| continue; |
| } |
| Err(i) => { |
| if i < table.len() { |
| next_case_fold = table[i].0; |
| } else { |
| next_case_fold = '\u{10FFFF}'; |
| } |
| c |
| } |
| }; |
| } |
| // The fast path. We know this character doesn't have an entry |
| // in the case folding table. |
| if c != inc_char(end) { |
| ranges.push(ClassRange::new(start, end)); |
| start = c; |
| } |
| end = c; |
| } |
| ranges.push(ClassRange::new(start, end)); |
| ranges |
| } |
| } |
| |
| impl PartialEq<char> for ClassRange { |
| #[inline] |
| fn eq(&self, other: &char) -> bool { |
| self.start <= *other && *other <= self.end |
| } |
| } |
| |
| impl PartialEq<ClassRange> for char { |
| #[inline] |
| fn eq(&self, other: &ClassRange) -> bool { |
| other.eq(self) |
| } |
| } |
| |
| impl PartialOrd<char> for ClassRange { |
| #[inline] |
| fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
| Some(if self == other { |
| Ordering::Equal |
| } else if *other > self.end { |
| Ordering::Greater |
| } else { |
| Ordering::Less |
| }) |
| } |
| } |
| |
| impl PartialOrd<ClassRange> for char { |
| #[inline] |
| fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> { |
| other.partial_cmp(self).map(|o| o.reverse()) |
| } |
| } |
| |
| impl ByteClass { |
| /// Create a new class from an existing set of ranges. |
| pub fn new(ranges: Vec<ByteRange>) -> ByteClass { |
| ByteClass { ranges: ranges } |
| } |
| |
| /// Returns true if `b` is matched by this byte class. |
| pub fn matches(&self, b: u8) -> bool { |
| self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok() |
| } |
| |
| /// Removes the given byte from the class if it exists. |
| /// |
| /// Note that this takes `O(n)` time in the number of ranges. |
| pub fn remove(&mut self, b: u8) { |
| let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) { |
| Ok(i) => i, |
| Err(_) => return, |
| }; |
| let mut r = self.ranges.remove(i); |
| if r.start == b { |
| r.start = b.saturating_add(1); |
| if r.start > r.end || b == u8::MAX { |
| return; |
| } |
| self.ranges.insert(i, r); |
| } else if r.end == b { |
| r.end = b.saturating_sub(1); |
| if r.end < r.start || b == b'\x00' { |
| return; |
| } |
| self.ranges.insert(0, r); |
| } else { |
| let (mut r1, mut r2) = (r.clone(), r.clone()); |
| r1.end = b.saturating_sub(1); |
| if r1.start <= r1.end { |
| self.ranges.insert(i, r1); |
| i += 1; |
| } |
| r2.start = b.saturating_add(1); |
| if r2.start <= r2.end { |
| self.ranges.insert(i, r2); |
| } |
| } |
| } |
| |
| /// Create a new empty class from this one. |
| fn to_empty(&self) -> ByteClass { |
| ByteClass { ranges: Vec::with_capacity(self.len()) } |
| } |
| |
| /// Canonicalze any sequence of ranges. |
| /// |
| /// This is responsible for enforcing the canonical format invariants |
| /// as described on the docs for the `ByteClass` type. |
| fn canonicalize(mut self) -> ByteClass { |
| // TODO: Save some cycles here by checking if already canonicalized. |
| self.ranges.sort(); |
| let mut ordered = self.to_empty(); // TODO: Do this in place? |
| for candidate in self { |
| // If the candidate overlaps with an existing range, then it must |
| // be the most recent range added because we process the candidates |
| // in order. |
| if let Some(or) = ordered.ranges.last_mut() { |
| if or.overlapping(candidate) { |
| *or = or.merge(candidate); |
| continue; |
| } |
| } |
| ordered.ranges.push(candidate); |
| } |
| ordered |
| } |
| |
| /// Negates the byte class. |
| /// |
| /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b` |
| /// does not match `self.negate()`. |
| pub fn negate(mut self) -> ByteClass { |
| fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) } |
| |
| if self.is_empty() { |
| // Inverting an empty range yields all bytes. |
| return ByteClass { |
| ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }], |
| }; |
| } |
| self = self.canonicalize(); |
| let mut inv = self.to_empty(); |
| if self[0].start > b'\x00' { |
| inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1))); |
| } |
| for win in self.windows(2) { |
| inv.ranges.push(range(win[0].end.saturating_add(1), |
| win[1].start.saturating_sub(1))); |
| } |
| if self[self.len() - 1].end < u8::MAX { |
| inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1), |
| u8::MAX)); |
| } |
| inv |
| } |
| |
| /// Apply case folding to this byte class. |
| /// |
| /// This assumes that the bytes in the ranges are ASCII compatible. |
| /// |
| /// N.B. Applying case folding to a negated character class probably |
| /// won't produce the expected result. e.g., `(?i)[^x]` really should |
| /// match any character sans `x` and `X`, but if `[^x]` is negated |
| /// before being case folded, you'll end up matching any character. |
| pub fn case_fold(self) -> ByteClass { |
| let mut folded = self.to_empty(); |
| for r in self { |
| folded.ranges.extend(r.case_fold()); |
| } |
| folded.canonicalize() |
| } |
| |
| /// Returns the number of bytes that match this class. |
| fn num_bytes(&self) -> usize { |
| self.ranges.iter() |
| .map(|&r| 1 + (r.end as u32) - (r.start as u32)) |
| .fold(0, |acc, len| acc + len) |
| as usize |
| } |
| } |
| |
| impl ByteRange { |
| /// Create a new class range. |
| /// |
| /// If `end < start`, then the two values are swapped so that |
| /// the invariant `start <= end` is preserved. |
| fn new(start: u8, end: u8) -> ByteRange { |
| if start <= end { |
| ByteRange { start: start, end: end } |
| } else { |
| ByteRange { start: end, end: start } |
| } |
| } |
| |
| /// Returns true if and only if the two ranges are overlapping. Note that |
| /// since ranges are inclusive, `a-c` and `d-f` are overlapping! |
| fn overlapping(self, other: ByteRange) -> bool { |
| max(self.start, other.start) |
| <= min(self.end, other.end).saturating_add(1) |
| } |
| |
| /// Returns true if and only if the intersection of self and other is non |
| /// empty. |
| fn is_intersect_empty(self, other: ByteRange) -> bool { |
| max(self.start, other.start) > min(self.end, other.end) |
| } |
| |
| /// Creates a new range representing the union of `self` and `other. |
| fn merge(self, other: ByteRange) -> ByteRange { |
| ByteRange { |
| start: min(self.start, other.start), |
| end: max(self.end, other.end), |
| } |
| } |
| |
| /// Apply case folding to this range. |
| /// |
| /// Since case folding might add bytes such that the range is no |
| /// longer contiguous, this returns multiple byte ranges. |
| /// |
| /// This assumes that the bytes in this range are ASCII compatible. |
| fn case_fold(self) -> Vec<ByteRange> { |
| // So much easier than Unicode case folding! |
| let mut ranges = vec![self]; |
| if !ByteRange::new(b'a', b'z').is_intersect_empty(self) { |
| let lower = max(self.start, b'a'); |
| let upper = min(self.end, b'z'); |
| ranges.push(ByteRange::new(lower - 32, upper - 32)); |
| } |
| if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) { |
| let lower = max(self.start, b'A'); |
| let upper = min(self.end, b'Z'); |
| ranges.push(ByteRange::new(lower + 32, upper + 32)); |
| } |
| ranges |
| } |
| } |
| |
| impl Deref for ByteClass { |
| type Target = Vec<ByteRange>; |
| fn deref(&self) -> &Vec<ByteRange> { &self.ranges } |
| } |
| |
| impl IntoIterator for ByteClass { |
| type Item = ByteRange; |
| type IntoIter = vec::IntoIter<ByteRange>; |
| fn into_iter(self) -> vec::IntoIter<ByteRange> { self.ranges.into_iter() } |
| } |
| |
| impl<'a> IntoIterator for &'a ByteClass { |
| type Item = &'a ByteRange; |
| type IntoIter = slice::Iter<'a, ByteRange>; |
| fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() } |
| } |
| |
| impl PartialEq<u8> for ByteRange { |
| #[inline] |
| fn eq(&self, other: &u8) -> bool { |
| self.start <= *other && *other <= self.end |
| } |
| } |
| |
| impl PartialEq<ByteRange> for u8 { |
| #[inline] |
| fn eq(&self, other: &ByteRange) -> bool { |
| other.eq(self) |
| } |
| } |
| |
| impl PartialOrd<u8> for ByteRange { |
| #[inline] |
| fn partial_cmp(&self, other: &u8) -> Option<Ordering> { |
| Some(if self == other { |
| Ordering::Equal |
| } else if *other > self.end { |
| Ordering::Greater |
| } else { |
| Ordering::Less |
| }) |
| } |
| } |
| |
| impl PartialOrd<ByteRange> for u8 { |
| #[inline] |
| fn partial_cmp(&self, other: &ByteRange) -> Option<Ordering> { |
| other.partial_cmp(self).map(|o| o.reverse()) |
| } |
| } |
| |
| /// This implementation of `Display` will write a regular expression from the |
| /// syntax tree. It does not write the original string parsed. |
| impl fmt::Display for Expr { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match *self { |
| Empty => write!(f, ""), |
| Literal { ref chars, casei } => { |
| if casei { |
| try!(write!(f, "(?iu:")); |
| } else { |
| try!(write!(f, "(?u:")); |
| } |
| for &c in chars { |
| try!(write!(f, "{}", quote_char(c))); |
| } |
| try!(write!(f, ")")); |
| Ok(()) |
| } |
| LiteralBytes { ref bytes, casei } => { |
| if casei { |
| try!(write!(f, "(?i-u:")); |
| } else { |
| try!(write!(f, "(?-u:")); |
| } |
| for &b in bytes { |
| try!(write!(f, "{}", quote_byte(b))); |
| } |
| try!(write!(f, ")")); |
| Ok(()) |
| } |
| AnyChar => write!(f, "(?su:.)"), |
| AnyCharNoNL => write!(f, "(?u:.)"), |
| AnyByte => write!(f, "(?s-u:.)"), |
| AnyByteNoNL => write!(f, "(?-u:.)"), |
| Class(ref cls) => write!(f, "{}", cls), |
| ClassBytes(ref cls) => write!(f, "{}", cls), |
| StartLine => write!(f, "(?m:^)"), |
| EndLine => write!(f, "(?m:$)"), |
| StartText => write!(f, r"^"), |
| EndText => write!(f, r"$"), |
| WordBoundary => write!(f, r"(?u:\b)"), |
| NotWordBoundary => write!(f, r"(?u:\B)"), |
| WordBoundaryAscii => write!(f, r"(?-u:\b)"), |
| NotWordBoundaryAscii => write!(f, r"(?-u:\B)"), |
| Group { ref e, i: None, name: None } => write!(f, "(?:{})", e), |
| Group { ref e, name: None, .. } => write!(f, "({})", e), |
| Group { ref e, name: Some(ref n), .. } => { |
| write!(f, "(?P<{}>{})", n, e) |
| } |
| Repeat { ref e, r, greedy } => { |
| match &**e { |
| &Literal { ref chars, .. } if chars.len() > 1 => { |
| try!(write!(f, "(?:{}){}", e, r)) |
| } |
| _ => try!(write!(f, "{}{}", e, r)), |
| } |
| if !greedy { try!(write!(f, "?")); } |
| Ok(()) |
| } |
| Concat(ref es) => { |
| for e in es { |
| try!(write!(f, "{}", e)); |
| } |
| Ok(()) |
| } |
| Alternate(ref es) => { |
| for (i, e) in es.iter().enumerate() { |
| if i > 0 { try!(write!(f, "|")); } |
| try!(write!(f, "{}", e)); |
| } |
| Ok(()) |
| } |
| } |
| } |
| } |
| |
| impl fmt::Display for Repeater { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match *self { |
| ZeroOrOne => write!(f, "?"), |
| ZeroOrMore => write!(f, "*"), |
| OneOrMore => write!(f, "+"), |
| Range { min: s, max: None } => write!(f, "{{{},}}", s), |
| Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s), |
| Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e), |
| } |
| } |
| } |
| |
| impl fmt::Display for CharClass { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| try!(write!(f, "(?u:[")); |
| for range in self.iter() { |
| if range.start == '-' || range.end == '-' { |
| try!(write!(f, "-")); |
| break; |
| } |
| } |
| for range in self.iter() { |
| let mut range = *range; |
| if range.start == '-' { |
| range.start = ((range.start as u8) + 1) as char; |
| } |
| if range.end == '-' { |
| range.end = ((range.end as u8) - 1) as char; |
| } |
| if range.start > range.end { |
| continue; |
| } |
| try!(write!(f, "{}", range)); |
| } |
| try!(write!(f, "])")); |
| Ok(()) |
| } |
| } |
| |
| impl fmt::Display for ClassRange { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "{}-{}", quote_char(self.start), quote_char(self.end)) |
| } |
| } |
| |
| impl fmt::Display for ByteClass { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| try!(write!(f, "(?-u:[")); |
| for range in self.iter() { |
| if range.start == b'-' || range.end == b'-' { |
| try!(write!(f, "-")); |
| break; |
| } |
| } |
| for range in self.iter() { |
| let mut range = *range; |
| if range.start == b'-' { |
| range.start += 1; |
| } |
| if range.end == b'-' { |
| range.start -= 1; |
| } |
| if range.start > range.end { |
| continue; |
| } |
| try!(write!(f, "{}", range)); |
| } |
| try!(write!(f, "])")); |
| Ok(()) |
| } |
| } |
| |
| impl fmt::Display for ByteRange { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end)) |
| } |
| } |
| |
| /// An alias for computations that can return a `Error`. |
| pub type Result<T> = ::std::result::Result<T, Error>; |
| |
| /// A parse error. |
| /// |
| /// This includes details about the specific type of error and a rough |
| /// approximation of where it occurred. |
| #[derive(Clone, Debug, PartialEq)] |
| pub struct Error { |
| pos: usize, |
| surround: String, |
| kind: ErrorKind, |
| } |
| |
| /// The specific type of parse error that can occur. |
| #[derive(Clone, Debug, PartialEq)] |
| pub enum ErrorKind { |
| /// A negation symbol is used twice in flag settings. |
| /// e.g., `(?-i-s)`. |
| DoubleFlagNegation, |
| /// The same capture name was used more than once. |
| /// e.g., `(?P<a>.)(?P<a>.)`. |
| DuplicateCaptureName(String), |
| /// An alternate is empty. e.g., `(|a)`. |
| EmptyAlternate, |
| /// A capture group name is empty. e.g., `(?P<>a)`. |
| EmptyCaptureName, |
| /// A negation symbol was not proceded by any flags. e.g., `(?i-)`. |
| EmptyFlagNegation, |
| /// A group is empty. e.g., `()`. |
| EmptyGroup, |
| /// An invalid number was used in a counted repetition. e.g., `a{b}`. |
| InvalidBase10(String), |
| /// An invalid hexadecimal number was used in an escape sequence. |
| /// e.g., `\xAG`. |
| InvalidBase16(String), |
| /// An invalid capture name was used. e.g., `(?P<0a>b)`. |
| InvalidCaptureName(String), |
| /// An invalid class range was givien. Specifically, when the start of the |
| /// range is greater than the end. e.g., `[z-a]`. |
| InvalidClassRange { |
| /// The first character specified in the range. |
| start: char, |
| /// The second character specified in the range. |
| end: char, |
| }, |
| /// An escape sequence was used in a character class where it is not |
| /// allowed. e.g., `[a-\pN]` or `[\A]`. |
| InvalidClassEscape(Expr), |
| /// An invalid counted repetition min/max was given. e.g., `a{2,1}`. |
| InvalidRepeatRange { |
| /// The first number specified in the repetition. |
| min: u32, |
| /// The second number specified in the repetition. |
| max: u32, |
| }, |
| /// An invalid Unicode scalar value was used in a long hexadecimal |
| /// sequence. e.g., `\x{D800}`. |
| InvalidScalarValue(u32), |
| /// An empty counted repetition operator. e.g., `a{}`. |
| MissingBase10, |
| /// A repetition operator was not applied to an expression. e.g., `*`. |
| RepeaterExpectsExpr, |
| /// A repetition operator was applied to an expression that cannot be |
| /// repeated. e.g., `a+*` or `a|*`. |
| RepeaterUnexpectedExpr(Expr), |
| /// A capture group name that is never closed. e.g., `(?P<a`. |
| UnclosedCaptureName(String), |
| /// An unclosed hexadecimal literal. e.g., `\x{a`. |
| UnclosedHex, |
| /// An unclosed parenthesis. e.g., `(a`. |
| UnclosedParen, |
| /// An unclosed counted repetition operator. e.g., `a{2`. |
| UnclosedRepeat, |
| /// An unclosed named Unicode class. e.g., `\p{Yi`. |
| UnclosedUnicodeName, |
| /// Saw end of regex before class was closed. e.g., `[a`. |
| UnexpectedClassEof, |
| /// Saw end of regex before escape sequence was closed. e.g., `\`. |
| UnexpectedEscapeEof, |
| /// Saw end of regex before flags were closed. e.g., `(?i`. |
| UnexpectedFlagEof, |
| /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`. |
| UnexpectedTwoDigitHexEof, |
| /// Unopened parenthesis. e.g., `)`. |
| UnopenedParen, |
| /// Unrecognized escape sequence. e.g., `\q`. |
| UnrecognizedEscape(char), |
| /// Unrecognized flag. e.g., `(?a)`. |
| UnrecognizedFlag(char), |
| /// Unrecognized named Unicode class. e.g., `\p{Foo}`. |
| UnrecognizedUnicodeClass(String), |
| /// Indicates that the regex uses too much nesting. |
| /// |
| /// (N.B. This error exists because traversing the Expr is recursive and |
| /// an explicit heap allocated stack is not (yet?) used. Regardless, some |
| /// sort of limit must be applied to avoid unbounded memory growth. |
| StackExhausted, |
| /// A disallowed flag was found (e.g., `u`). |
| FlagNotAllowed(char), |
| /// A Unicode class was used when the Unicode (`u`) flag was disabled. |
| UnicodeNotAllowed, |
| /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes. |
| /// This never returned if the parser is permitted to allow expressions |
| /// that match arbitrary bytes. |
| InvalidUtf8, |
| /// A character class was constructed such that it is empty. |
| /// e.g., `[^\d\D]`. |
| EmptyClass, |
| /// Hints that destructuring should not be exhaustive. |
| /// |
| /// This enum may grow additional variants, so this makes sure clients |
| /// don't count on exhaustive matching. (Otherwise, adding a new variant |
| /// could break existing code.) |
| #[doc(hidden)] |
| __Nonexhaustive, |
| } |
| |
| impl Error { |
| /// Returns an approximate *character* offset at which the error occurred. |
| /// |
| /// The character offset may be equal to the number of characters in the |
| /// string, in which case it should be interpreted as pointing to the end |
| /// of the regex. |
| pub fn position(&self) -> usize { |
| self.pos |
| } |
| |
| /// Returns the type of the regex parse error. |
| pub fn kind(&self) -> &ErrorKind { |
| &self.kind |
| } |
| } |
| |
| impl ErrorKind { |
| fn description(&self) -> &str { |
| use ErrorKind::*; |
| match *self { |
| DoubleFlagNegation => "double flag negation", |
| DuplicateCaptureName(_) => "duplicate capture name", |
| EmptyAlternate => "empty alternate", |
| EmptyCaptureName => "empty capture name", |
| EmptyFlagNegation => "flag negation without any flags", |
| EmptyGroup => "empty group (e.g., '()')", |
| InvalidBase10(_) => "invalid base 10 number", |
| InvalidBase16(_) => "invalid base 16 number", |
| InvalidCaptureName(_) => "invalid capture name", |
| InvalidClassRange{..} => "invalid character class range", |
| InvalidClassEscape(_) => "invalid escape sequence in class", |
| InvalidRepeatRange{..} => "invalid counted repetition range", |
| InvalidScalarValue(_) => "invalid Unicode scalar value", |
| MissingBase10 => "missing count in repetition operator", |
| RepeaterExpectsExpr => "repetition operator missing expression", |
| RepeaterUnexpectedExpr(_) => "expression cannot be repeated", |
| UnclosedCaptureName(_) => "unclosed capture group name", |
| UnclosedHex => "unclosed hexadecimal literal", |
| UnclosedParen => "unclosed parenthesis", |
| UnclosedRepeat => "unclosed counted repetition operator", |
| UnclosedUnicodeName => "unclosed Unicode class literal", |
| UnexpectedClassEof => "unexpected EOF in character class", |
| UnexpectedEscapeEof => "unexpected EOF in escape sequence", |
| UnexpectedFlagEof => "unexpected EOF in flags", |
| UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal", |
| UnopenedParen => "unopened parenthesis", |
| UnrecognizedEscape(_) => "unrecognized escape sequence", |
| UnrecognizedFlag(_) => "unrecognized flag", |
| UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name", |
| StackExhausted => "stack exhausted, too much nesting", |
| FlagNotAllowed(_) => "flag not allowed", |
| UnicodeNotAllowed => "Unicode features not allowed", |
| InvalidUtf8 => "matching arbitrary bytes is not allowed", |
| EmptyClass => "empty character class", |
| __Nonexhaustive => unreachable!(), |
| } |
| } |
| } |
| |
| impl ::std::error::Error for Error { |
| fn description(&self) -> &str { |
| self.kind.description() |
| } |
| } |
| |
| impl fmt::Display for Error { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if let ErrorKind::StackExhausted = self.kind { |
| write!(f, "Error parsing regex: {}", self.kind) |
| } else { |
| write!( |
| f, "Error parsing regex near '{}' at character offset {}: {}", |
| self.surround, self.pos, self.kind) |
| } |
| } |
| } |
| |
| impl fmt::Display for ErrorKind { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| use ErrorKind::*; |
| match *self { |
| DoubleFlagNegation => |
| write!(f, "Only one negation symbol is allowed in flags."), |
| DuplicateCaptureName(ref s) => |
| write!(f, "Capture name '{}' is used more than once.", s), |
| EmptyAlternate => |
| write!(f, "Alternations cannot be empty."), |
| EmptyCaptureName => |
| write!(f, "Capture names cannot be empty."), |
| EmptyFlagNegation => |
| write!(f, "Flag negation requires setting at least one flag."), |
| EmptyGroup => |
| write!(f, "Empty regex groups (e.g., '()') are not allowed."), |
| InvalidBase10(ref s) => |
| write!(f, "Not a valid base 10 number: '{}'", s), |
| InvalidBase16(ref s) => |
| write!(f, "Not a valid base 16 number: '{}'", s), |
| InvalidCaptureName(ref s) => |
| write!(f, "Invalid capture name: '{}'. Capture names must \ |
| consist of [_a-zA-Z0-9] and are not allowed to \ |
| start with with a number.", s), |
| InvalidClassRange { start, end } => |
| write!(f, "Invalid character class range '{}-{}'. \ |
| Character class ranges must start with the smaller \ |
| character, but {} > {}", start, end, start, end), |
| InvalidClassEscape(ref e) => |
| write!(f, "Invalid escape sequence in character \ |
| class: '{}'.", e), |
| InvalidRepeatRange { min, max } => |
| write!(f, "Invalid counted repetition range: {{{}, {}}}. \ |
| Counted repetition ranges must start with the \ |
| minimum, but {} > {}", min, max, min, max), |
| InvalidScalarValue(c) => |
| write!(f, "Number does not correspond to a Unicode scalar \ |
| value: '{}'.", c), |
| MissingBase10 => |
| write!(f, "Missing maximum in counted reptition operator."), |
| RepeaterExpectsExpr => |
| write!(f, "Missing expression for reptition operator."), |
| RepeaterUnexpectedExpr(ref e) => |
| write!(f, "Invalid application of reptition operator to: \ |
| '{}'.", e), |
| UnclosedCaptureName(ref s) => |
| write!(f, "Capture name group for '{}' is not closed. \ |
| (Missing a '>'.)", s), |
| UnclosedHex => |
| write!(f, "Unclosed hexadecimal literal (missing a '}}')."), |
| UnclosedParen => |
| write!(f, "Unclosed parenthesis."), |
| UnclosedRepeat => |
| write!(f, "Unclosed counted repetition (missing a '}}')."), |
| UnclosedUnicodeName => |
| write!(f, "Unclosed Unicode literal (missing a '}}')."), |
| UnexpectedClassEof => |
| write!(f, "Character class was not closed before the end of \ |
| the regex (missing a ']')."), |
| UnexpectedEscapeEof => |
| write!(f, "Started an escape sequence that didn't finish \ |
| before the end of the regex."), |
| UnexpectedFlagEof => |
| write!(f, "Inline flag settings was not closed before the end \ |
| of the regex (missing a ')' or ':')."), |
| UnexpectedTwoDigitHexEof => |
| write!(f, "Unexpected end of two digit hexadecimal literal."), |
| UnopenedParen => |
| write!(f, "Unopened parenthesis."), |
| UnrecognizedEscape(c) => |
| write!(f, "Unrecognized escape sequence: '\\{}'.", c), |
| UnrecognizedFlag(c) => |
| write!(f, "Unrecognized flag: '{}'. \ |
| (Allowed flags: i, m, s, U, u, x.)", c), |
| UnrecognizedUnicodeClass(ref s) => |
| write!(f, "Unrecognized Unicode class name: '{}'.", s), |
| StackExhausted => |
| write!(f, "Exhausted space required to parse regex with too \ |
| much nesting."), |
| FlagNotAllowed(flag) => |
| write!(f, "Use of the flag '{}' is not allowed.", flag), |
| UnicodeNotAllowed => |
| write!(f, "Unicode features are not allowed when the Unicode \ |
| (u) flag is not set."), |
| InvalidUtf8 => |
| write!(f, "Matching arbitrary bytes is not allowed."), |
| EmptyClass => |
| write!(f, "Empty character classes are not allowed."), |
| __Nonexhaustive => unreachable!(), |
| } |
| } |
| } |
| |
| /// The result of binary search on the simple case folding table. |
| /// |
| /// Note that this binary search is done on the "both" table, such that |
| /// the index returned corresponds to the *first* location of `c1` in the |
| /// table. The table can then be scanned linearly starting from the position |
| /// returned to find other case mappings for `c1`. |
| fn simple_case_fold_both_result(c1: char) -> result::Result<usize, usize> { |
| let table = &case_folding::C_plus_S_both_table; |
| let i = binary_search(table, |&(c2, _)| c1 <= c2); |
| if i >= table.len() || table[i].0 != c1 { |
| Err(i) |
| } else { |
| Ok(i) |
| } |
| } |
| |
| /// Binary search to find first element such that `pred(T) == true`. |
| /// |
| /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`. |
| /// |
| /// If all elements yield `pred(T) == false`, then `xs.len()` is returned. |
| fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize |
| where F: FnMut(&T) -> bool { |
| let (mut left, mut right) = (0, xs.len()); |
| while left < right { |
| let mid = (left + right) / 2; |
| if pred(&xs[mid]) { |
| right = mid; |
| } else { |
| left = mid + 1; |
| } |
| } |
| left |
| } |
| |
| /// Escapes all regular expression meta characters in `text`. |
| /// |
| /// The string returned may be safely used as a literal in a regular |
| /// expression. |
| pub fn quote(text: &str) -> String { |
| let mut quoted = String::with_capacity(text.len()); |
| for c in text.chars() { |
| if parser::is_punct(c) { |
| quoted.push('\\'); |
| } |
| quoted.push(c); |
| } |
| quoted |
| } |
| |
| fn quote_char(c: char) -> String { |
| let mut s = String::new(); |
| if parser::is_punct(c) { |
| s.push('\\'); |
| } |
| s.push(c); |
| s |
| } |
| |
| fn quote_byte(b: u8) -> String { |
| if parser::is_punct(b as char) || b == b'\'' || b == b'"' { |
| quote_char(b as char) |
| } else { |
| let escaped: Vec<u8> = ascii::escape_default(b).collect(); |
| String::from_utf8(escaped).unwrap() |
| } |
| } |
| |
| fn inc_char(c: char) -> char { |
| match c { |
| char::MAX => char::MAX, |
| '\u{D7FF}' => '\u{E000}', |
| c => char::from_u32(c as u32 + 1).unwrap(), |
| } |
| } |
| |
| fn dec_char(c: char) -> char { |
| match c { |
| '\x00' => '\x00', |
| '\u{E000}' => '\u{D7FF}', |
| c => char::from_u32(c as u32 - 1).unwrap(), |
| } |
| } |
| |
| /// Returns true if and only if `c` is a word character. |
| #[doc(hidden)] |
| pub fn is_word_char(c: char) -> bool { |
| match c { |
| '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true, |
| _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| { |
| if c >= start && c <= end { |
| Ordering::Equal |
| } else if start > c { |
| Ordering::Greater |
| } else { |
| Ordering::Less |
| } |
| }).is_ok(), |
| } |
| } |
| |
| /// Returns true if and only if `c` is an ASCII word byte. |
| #[doc(hidden)] |
| pub fn is_word_byte(b: u8) -> bool { |
| match b { |
| b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' => true, |
| _ => false, |
| } |
| } |
| |
| #[cfg(test)] |
| mod properties; |
| |
| #[cfg(test)] |
| mod tests { |
| use {CharClass, ClassRange, ByteClass, ByteRange, Expr}; |
| |
| fn class(ranges: &[(char, char)]) -> CharClass { |
| let ranges = ranges.iter().cloned() |
| .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); |
| CharClass::new(ranges) |
| } |
| |
| fn bclass(ranges: &[(u8, u8)]) -> ByteClass { |
| let ranges = ranges.iter().cloned() |
| .map(|(c1, c2)| ByteRange::new(c1, c2)).collect(); |
| ByteClass::new(ranges) |
| } |
| |
| fn e(re: &str) -> Expr { Expr::parse(re).unwrap() } |
| |
| #[test] |
| fn stack_exhaustion() { |
| use std::iter::repeat; |
| |
| let open: String = repeat('(').take(200).collect(); |
| let close: String = repeat(')').take(200).collect(); |
| assert!(Expr::parse(&format!("{}a{}", open, close)).is_ok()); |
| |
| let open: String = repeat('(').take(200 + 1).collect(); |
| let close: String = repeat(')').take(200 + 1).collect(); |
| assert!(Expr::parse(&format!("{}a{}", open, close)).is_err()); |
| } |
| |
| #[test] |
| fn anchored_start() { |
| assert!(e("^a").is_anchored_start()); |
| assert!(e("(^a)").is_anchored_start()); |
| assert!(e("^a|^b").is_anchored_start()); |
| assert!(e("(^a)|(^b)").is_anchored_start()); |
| assert!(e("(^(a|b))").is_anchored_start()); |
| |
| assert!(!e("^a|b").is_anchored_start()); |
| assert!(!e("a|^b").is_anchored_start()); |
| } |
| |
| #[test] |
| fn anchored_end() { |
| assert!(e("a$").is_anchored_end()); |
| assert!(e("(a$)").is_anchored_end()); |
| assert!(e("a$|b$").is_anchored_end()); |
| assert!(e("(a$)|(b$)").is_anchored_end()); |
| assert!(e("((a|b)$)").is_anchored_end()); |
| |
| assert!(!e("a$|b").is_anchored_end()); |
| assert!(!e("a|b$").is_anchored_end()); |
| } |
| |
| #[test] |
| fn class_canon_no_change() { |
| let cls = class(&[('a', 'c'), ('x', 'z')]); |
| assert_eq!(cls.clone().canonicalize(), cls); |
| } |
| |
| #[test] |
| fn class_canon_unordered() { |
| let cls = class(&[('x', 'z'), ('a', 'c')]); |
| assert_eq!(cls.canonicalize(), class(&[ |
| ('a', 'c'), ('x', 'z'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_overlap() { |
| let cls = class(&[('x', 'z'), ('w', 'y')]); |
| assert_eq!(cls.canonicalize(), class(&[ |
| ('w', 'z'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_overlap_many() { |
| let cls = class(&[ |
| ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), |
| ('m', 'p'), ('l', 's'), |
| ]); |
| assert_eq!(cls.clone().canonicalize(), class(&[ |
| ('a', 'j'), ('l', 's'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_overlap_boundary() { |
| let cls = class(&[('x', 'z'), ('u', 'w')]); |
| assert_eq!(cls.canonicalize(), class(&[ |
| ('u', 'z'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_extreme_edge_case() { |
| let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); |
| assert_eq!(cls.canonicalize(), class(&[ |
| ('\x00', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_singles() { |
| let cls = class(&[('a', 'a'), ('b', 'b')]); |
| assert_eq!(cls.canonicalize(), class(&[('a', 'b')])); |
| } |
| |
| #[test] |
| fn class_negate_single() { |
| let cls = class(&[('a', 'a')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_singles() { |
| let cls = class(&[('a', 'a'), ('b', 'b')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\x60'), ('\x63', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_multiples() { |
| let cls = class(&[('a', 'c'), ('x', 'z')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_min_scalar() { |
| let cls = class(&[('\x00', 'a')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x62', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_max_scalar() { |
| let cls = class(&[('a', '\u{10FFFF}')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\x60'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_everything() { |
| let cls = class(&[('\x00', '\u{10FFFF}')]); |
| assert_eq!(cls.negate(), class(&[])); |
| } |
| |
| #[test] |
| fn class_negate_everything_sans_one() { |
| let cls = class(&[ |
| ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}') |
| ]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\u{10FFFE}', '\u{10FFFE}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_surrogates_min() { |
| let cls = class(&[('\x00', '\u{D7FF}')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\u{E000}', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_surrogates_min_edge() { |
| let cls = class(&[('\x00', '\u{D7FE}')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\u{D7FF}', '\u{10FFFF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_surrogates_max() { |
| let cls = class(&[('\u{E000}', '\u{10FFFF}')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\u{D7FF}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_negate_surrogates_max_edge() { |
| let cls = class(&[('\u{E001}', '\u{10FFFF}')]); |
| assert_eq!(cls.negate(), class(&[ |
| ('\x00', '\u{E000}'), |
| ])); |
| } |
| |
| #[test] |
| fn class_canon_overlap_many_case_fold() { |
| let cls = class(&[ |
| ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), |
| ('M', 'P'), ('L', 'S'), ('c', 'f'), |
| ]); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('A', 'J'), ('L', 'S'), |
| ('a', 'j'), ('l', 's'), |
| ('\u{17F}', '\u{17F}'), |
| ])); |
| |
| let cls = bclass(&[ |
| (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), |
| (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), |
| ]); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'A', b'J'), (b'L', b'S'), |
| (b'a', b'j'), (b'l', b's'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_az() { |
| let cls = class(&[('A', 'Z')]); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('A', 'Z'), ('a', 'z'), |
| ('\u{17F}', '\u{17F}'), |
| ('\u{212A}', '\u{212A}'), |
| ])); |
| let cls = class(&[('a', 'z')]); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('A', 'Z'), ('a', 'z'), |
| ('\u{17F}', '\u{17F}'), |
| ('\u{212A}', '\u{212A}'), |
| ])); |
| |
| let cls = bclass(&[(b'A', b'Z')]); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'A', b'Z'), (b'a', b'z'), |
| ])); |
| let cls = bclass(&[(b'a', b'z')]); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'A', b'Z'), (b'a', b'z'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_a_underscore() { |
| let cls = class(&[('A', 'A'), ('_', '_')]); |
| assert_eq!(cls.clone().canonicalize(), class(&[ |
| ('A', 'A'), ('_', '_'), |
| ])); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('A', 'A'), ('_', '_'), ('a', 'a'), |
| ])); |
| |
| let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); |
| assert_eq!(cls.clone().canonicalize(), bclass(&[ |
| (b'A', b'A'), (b'_', b'_'), |
| ])); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'A', b'A'), (b'_', b'_'), (b'a', b'a'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_a_equals() { |
| let cls = class(&[('A', 'A'), ('=', '=')]); |
| assert_eq!(cls.clone().canonicalize(), class(&[ |
| ('=', '='), ('A', 'A'), |
| ])); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('=', '='), ('A', 'A'), ('a', 'a'), |
| ])); |
| |
| let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); |
| assert_eq!(cls.clone().canonicalize(), bclass(&[ |
| (b'=', b'='), (b'A', b'A'), |
| ])); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'=', b'='), (b'A', b'A'), (b'a', b'a'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_no_folding_needed() { |
| let cls = class(&[('\x00', '\x10')]); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('\x00', '\x10'), |
| ])); |
| |
| let cls = bclass(&[(b'\x00', b'\x10')]); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'\x00', b'\x10'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_negated() { |
| let cls = class(&[('x', 'x')]); |
| assert_eq!(cls.clone().case_fold(), class(&[ |
| ('X', 'X'), ('x', 'x'), |
| ])); |
| assert_eq!(cls.case_fold().negate(), class(&[ |
| ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'), |
| ])); |
| |
| let cls = bclass(&[(b'x', b'x')]); |
| assert_eq!(cls.clone().case_fold(), bclass(&[ |
| (b'X', b'X'), (b'x', b'x'), |
| ])); |
| assert_eq!(cls.case_fold().negate(), bclass(&[ |
| (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_single_to_multiple() { |
| let cls = class(&[('k', 'k')]); |
| assert_eq!(cls.case_fold(), class(&[ |
| ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), |
| ])); |
| |
| let cls = bclass(&[(b'k', b'k')]); |
| assert_eq!(cls.case_fold(), bclass(&[ |
| (b'K', b'K'), (b'k', b'k'), |
| ])); |
| } |
| |
| #[test] |
| fn class_fold_at() { |
| let cls = class(&[('@', '@')]); |
| assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')])); |
| assert_eq!(cls.case_fold(), class(&[('@', '@')])); |
| |
| let cls = bclass(&[(b'@', b'@')]); |
| assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')])); |
| assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')])); |
| } |
| |
| #[test] |
| fn roundtrip_class_hypen() { |
| let expr = e("[-./]"); |
| assert_eq!("(?u:[-\\.-/])", expr.to_string()); |
| |
| let expr = e("(?-u)[-./]"); |
| assert_eq!("(?-u:[-\\.-/])", expr.to_string()); |
| } |
| } |