| // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT |
| // file at the top-level directory of this distribution and at |
| // http://rust-lang.org/COPYRIGHT. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| use std::cmp::{max, min}; |
| use std::u8; |
| |
| use unicode::regex::UNICODE_CLASSES; |
| |
| use { |
| Expr, Repeater, CharClass, ClassRange, |
| CaptureIndex, CaptureName, |
| Error, ErrorKind, Result, |
| }; |
| |
| /// Parser state. |
| /// |
| /// Keeps the entire input in memory and maintains a cursor (char offset). |
| /// |
| /// It also keeps an expression stack, which is responsible for managing |
| /// grouped expressions and flag state. |
| #[derive(Debug)] |
| pub struct Parser { |
| chars: Vec<char>, |
| chari: usize, |
| stack: Vec<Build>, |
| caps: usize, |
| names: Vec<String>, // to check for duplicates |
| flags: Flags, |
| } |
| |
| /// Flag state used in the parser. |
| #[derive(Clone, Copy, Debug)] |
| pub struct Flags { |
| /// i |
| pub casei: bool, |
| /// m |
| pub multi: bool, |
| /// s |
| pub dotnl: bool, |
| /// U |
| pub swap_greed: bool, |
| /// x |
| pub ignore_space: bool, |
| /// u |
| pub unicode: bool, |
| /// Not actually a flag, but when disabled, every regex that may not match |
| /// UTF-8 exclusively will cause the parser to return an error. |
| pub allow_bytes: bool, |
| } |
| |
| impl Default for Flags { |
| fn default() -> Self { |
| Flags { |
| casei: false, |
| multi: false, |
| dotnl: false, |
| swap_greed: false, |
| ignore_space: false, |
| unicode: true, |
| allow_bytes: false, |
| } |
| } |
| } |
| |
| /// An ephemeral type for representing the expression stack. |
| /// |
| /// Everything on the stack is either a regular expression or a marker |
| /// indicating the opening of a group (possibly non-capturing). The opening |
| /// of a group copies the current flag state, which is reset on the parser |
| /// state once the group closes. |
| #[derive(Debug)] |
| enum Build { |
| Expr(Expr), |
| LeftParen { |
| i: CaptureIndex, |
| name: CaptureName, |
| chari: usize, |
| old_flags: Flags, |
| }, |
| } |
| |
| // Primary expression parsing routines. |
| impl Parser { |
| pub fn parse(s: &str, flags: Flags) -> Result<Expr> { |
| Parser { |
| chars: s.chars().collect(), |
| chari: 0, |
| stack: vec![], |
| caps: 0, |
| names: vec![], |
| flags: flags, |
| }.parse_expr() |
| } |
| |
| // Top-level expression parser. |
| // |
| // Starts at the beginning of the input and consumes until either the end |
| // of input or an error. |
| fn parse_expr(mut self) -> Result<Expr> { |
| while !self.eof() { |
| let build_expr = match self.cur() { |
| '\\' => try!(self.parse_escape()), |
| '|' => { let e = try!(self.alternate()); self.bump(); e } |
| '?' => try!(self.parse_simple_repeat(Repeater::ZeroOrOne)), |
| '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)), |
| '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)), |
| '{' => try!(self.parse_counted_repeat()), |
| '[' => match self.maybe_parse_ascii() { |
| None => try!(self.parse_class()), |
| Some(cls) => Build::Expr(Expr::Class(cls)), |
| }, |
| '^' => { |
| if self.flags.multi { |
| self.parse_one(Expr::StartLine) |
| } else { |
| self.parse_one(Expr::StartText) |
| } |
| } |
| '$' => { |
| if self.flags.multi { |
| self.parse_one(Expr::EndLine) |
| } else { |
| self.parse_one(Expr::EndText) |
| } |
| } |
| '.' => { |
| if self.flags.dotnl { |
| if self.flags.unicode { |
| self.parse_one(Expr::AnyChar) |
| } else { |
| if !self.flags.allow_bytes { |
| return Err(self.err(ErrorKind::InvalidUtf8)); |
| } |
| self.parse_one(Expr::AnyByte) |
| } |
| } else { |
| if self.flags.unicode { |
| self.parse_one(Expr::AnyCharNoNL) |
| } else { |
| if !self.flags.allow_bytes { |
| return Err(self.err(ErrorKind::InvalidUtf8)); |
| } |
| self.parse_one(Expr::AnyByteNoNL) |
| } |
| } |
| } |
| '(' => try!(self.parse_group()), |
| ')' => { |
| let (old_flags, e) = try!(self.close_paren()); |
| self.bump(); |
| self.flags = old_flags; |
| e |
| } |
| _ => { |
| let c = self.bump(); |
| try!(self.lit(c)) |
| } |
| }; |
| if !build_expr.is_empty() { |
| self.stack.push(build_expr); |
| } |
| } |
| self.finish_concat() |
| } |
| |
| // Parses an escape sequence, e.g., \Ax |
| // |
| // Start: `\` |
| // End: `x` |
| fn parse_escape(&mut self) -> Result<Build> { |
| self.bump(); |
| if self.eof() { |
| return Err(self.err(ErrorKind::UnexpectedEscapeEof)); |
| } |
| let c = self.cur(); |
| if is_punct(c) { |
| let c = self.bump(); |
| return Ok(try!(self.lit(c))); |
| } |
| match c { |
| 'a' => { self.bump(); Ok(try!(self.lit('\x07'))) } |
| 'f' => { self.bump(); Ok(try!(self.lit('\x0C'))) } |
| 't' => { self.bump(); Ok(try!(self.lit('\t'))) } |
| 'n' => { self.bump(); Ok(try!(self.lit('\n'))) } |
| 'r' => { self.bump(); Ok(try!(self.lit('\r'))) } |
| 'v' => { self.bump(); Ok(try!(self.lit('\x0B'))) } |
| 'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) } |
| 'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) } |
| 'b' => { |
| self.bump(); |
| Ok(Build::Expr(if self.flags.unicode { |
| Expr::WordBoundary |
| } else { |
| Expr::WordBoundaryAscii |
| })) |
| } |
| 'B' => { |
| self.bump(); |
| Ok(Build::Expr(if self.flags.unicode { |
| Expr::NotWordBoundary |
| } else { |
| Expr::NotWordBoundaryAscii |
| })) |
| } |
| '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(), |
| 'x' => { self.bump(); self.parse_hex() } |
| 'p'|'P' => { |
| self.bump(); |
| self.parse_unicode_class(c == 'P') |
| .map(|cls| Build::Expr(Expr::Class(cls))) |
| } |
| 'd'|'s'|'w'|'D'|'S'|'W' => { |
| self.bump(); |
| Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) |
| } |
| c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), |
| } |
| } |
| |
| // Parses a group, e.g., `(abc)`. |
| // |
| // Start: `(` |
| // End: `a` |
| // |
| // A more interesting example, `(?P<foo>abc)`. |
| // |
| // Start: `(` |
| // End: `a` |
| fn parse_group(&mut self) -> Result<Build> { |
| let chari = self.chari; |
| let mut name: CaptureName = None; |
| self.bump(); |
| if self.bump_if("?P<") { |
| let n = try!(self.parse_group_name()); |
| if self.names.iter().any(|n2| n2 == &n) { |
| return Err(self.err(ErrorKind::DuplicateCaptureName(n))); |
| } |
| self.names.push(n.clone()); |
| name = Some(n); |
| } else if self.bump_if("?") { |
| // This can never be capturing. It's either setting flags for |
| // the current group, or it's opening a non-capturing group or |
| // it's opening a group with a specific set of flags (which is |
| // also non-capturing). |
| // Anything else is an error. |
| return self.parse_group_flags(chari); |
| } |
| self.caps = checkadd(self.caps, 1); |
| Ok(Build::LeftParen { |
| i: Some(self.caps), |
| name: name, |
| chari: chari, |
| old_flags: self.flags, // no flags changed if we're here |
| }) |
| } |
| |
| // Parses flags (inline or grouped), e.g., `(?s-i:abc)`. |
| // |
| // Start: `s` |
| // End: `a` |
| // |
| // Another example, `(?s-i)a`. |
| // |
| // Start: `s` |
| // End: `a` |
| fn parse_group_flags(&mut self, opening_chari: usize) -> Result<Build> { |
| let old_flags = self.flags; |
| let mut sign = true; |
| let mut saw_flag = false; |
| loop { |
| if self.eof() { |
| // e.g., (?i |
| return Err(self.err(ErrorKind::UnexpectedFlagEof)); |
| } |
| match self.cur() { |
| 'i' => { self.flags.casei = sign; saw_flag = true } |
| 'm' => { self.flags.multi = sign; saw_flag = true } |
| 's' => { self.flags.dotnl = sign; saw_flag = true } |
| 'U' => { self.flags.swap_greed = sign; saw_flag = true } |
| 'x' => { self.flags.ignore_space = sign; saw_flag = true } |
| 'u' => { self.flags.unicode = sign; saw_flag = true } |
| '-' => { |
| if !sign { |
| // e.g., (?-i-s) |
| return Err(self.err(ErrorKind::DoubleFlagNegation)); |
| } |
| sign = false; |
| saw_flag = false; |
| } |
| ')' => { |
| if !saw_flag { |
| // e.g., (?) |
| return Err(self.err(ErrorKind::EmptyFlagNegation)); |
| } |
| // At this point, we're just changing the flags inside |
| // the current group, which means the old flags have |
| // been saved elsewhere. Our modifications in place are |
| // okey dokey! |
| // |
| // This particular flag expression only has a stateful |
| // impact on a regex's AST, so nothing gets explicitly |
| // added. |
| self.bump(); |
| return Ok(Build::Expr(Expr::Empty)); |
| } |
| ':' => { |
| if !sign && !saw_flag { |
| // e.g., (?i-:a) |
| // Note that if there's no negation, it's OK not |
| // to see flag, because you end up with a regular |
| // non-capturing group: `(?:a)`. |
| return Err(self.err(ErrorKind::EmptyFlagNegation)); |
| } |
| self.bump(); |
| return Ok(Build::LeftParen { |
| i: None, |
| name: None, |
| chari: opening_chari, |
| old_flags: old_flags, |
| }); |
| } |
| // e.g., (?z:a) |
| c => return Err(self.err(ErrorKind::UnrecognizedFlag(c))), |
| } |
| self.bump(); |
| } |
| } |
| |
| // Parses a group name, e.g., `foo` in `(?P<foo>abc)`. |
| // |
| // Start: `f` |
| // End: `a` |
| fn parse_group_name(&mut self) -> Result<String> { |
| let mut name = String::new(); |
| while !self.eof() && !self.peek_is('>') { |
| name.push(self.bump()); |
| } |
| if self.eof() { |
| // e.g., (?P<a |
| return Err(self.err(ErrorKind::UnclosedCaptureName(name))); |
| } |
| let all_valid = name.chars().all(is_valid_capture_char); |
| match name.chars().next() { |
| // e.g., (?P<>a) |
| None => Err(self.err(ErrorKind::EmptyCaptureName)), |
| Some(c) if (c >= '0' && c <= '9') || !all_valid => { |
| // e.g., (?P<a#>x) |
| // e.g., (?P<1a>x) |
| Err(self.err(ErrorKind::InvalidCaptureName(name))) |
| } |
| _ => { |
| self.bump(); // for `>` |
| Ok(name) |
| } |
| } |
| } |
| |
| // Parses a counted repeition operator, e.g., `a{2,4}?z`. |
| // |
| // Start: `{` |
| // End: `z` |
| fn parse_counted_repeat(&mut self) -> Result<Build> { |
| let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., ({5} |
| if !e.can_repeat() { |
| // e.g., a*{5} |
| return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); |
| } |
| self.bump(); |
| let min = try!(self.parse_decimal(|c| c != ',' && c != '}')); |
| let mut max_opt = Some(min); |
| if self.bump_if(',') { |
| if self.peek_is('}') { |
| max_opt = None; |
| } else { |
| let max = try!(self.parse_decimal(|c| c != '}')); |
| if min > max { |
| // e.g., a{2,1} |
| return Err(self.err(ErrorKind::InvalidRepeatRange { |
| min: min, |
| max: max, |
| })); |
| } |
| max_opt = Some(max); |
| } |
| } |
| if !self.bump_if('}') { |
| Err(self.err(ErrorKind::UnclosedRepeat)) |
| } else { |
| Ok(Build::Expr(Expr::Repeat { |
| e: Box::new(e), |
| r: Repeater::Range { min: min, max: max_opt }, |
| greedy: !self.bump_if('?') ^ self.flags.swap_greed, |
| })) |
| } |
| } |
| |
| // Parses a simple repetition operator, e.g., `a+?z`. |
| // |
| // Start: `+` |
| // End: `z` |
| // |
| // N.B. "simple" in this context means "not min/max repetition", |
| // e.g., `a{1,2}`. |
| fn parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build> { |
| let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., (* |
| if !e.can_repeat() { |
| // e.g., a** |
| return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); |
| } |
| self.bump(); |
| Ok(Build::Expr(Expr::Repeat { |
| e: Box::new(e), |
| r: rep, |
| greedy: !self.bump_if('?') ^ self.flags.swap_greed, |
| })) |
| } |
| |
| // Parses a decimal number until the given character, e.g., `a{123,456}`. |
| // |
| // Start: `1` |
| // End: `,` (where `until == ','`) |
| fn parse_decimal<B: Bumpable>(&mut self, until: B) -> Result<u32> { |
| match self.bump_get(until) { |
| // e.g., a{} |
| None => Err(self.err(ErrorKind::MissingBase10)), |
| Some(n) => { |
| // e.g., a{xyz |
| // e.g., a{9999999999} |
| let n = n.trim(); |
| u32::from_str_radix(n, 10) |
| .map_err(|_| self.err(ErrorKind::InvalidBase10(n.into()))) |
| } |
| } |
| } |
| |
| // Parses an octal number, up to 3 digits, e.g., `a\123b` |
| // |
| // Start: `1` |
| // End: `b` |
| fn parse_octal(&mut self) -> Result<Build> { |
| use std::char; |
| let mut i = 0; // counter for limiting octal to 3 digits. |
| let n = self.bump_get(|c| { i += 1; i <= 3 && c >= '0' && c <= '7' }) |
| .expect("octal string"); // guaranteed at least 1 digit |
| // I think both of the following unwraps are impossible to fail. |
| // We limit it to a three digit octal number, which maxes out at |
| // `0777` or `511` in decimal. Since all digits are in `0...7`, we'll |
| // always have a valid `u32` number. Moreover, since all numbers in |
| // the range `0...511` are valid Unicode scalar values, it will always |
| // be a valid `char`. |
| // |
| // Hence, we `unwrap` with reckless abandon. |
| let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number"); |
| if !self.flags.unicode { |
| return Ok(try!(self.u32_to_one_byte(n))); |
| } |
| let c = char::from_u32(n).expect("Unicode scalar value"); |
| Ok(try!(self.lit(c))) |
| } |
| |
| // Parses a hex number, e.g., `a\x5ab`. |
| // |
| // Start: `5` |
| // End: `b` |
| // |
| // And also, `a\x{2603}b`. |
| // |
| // Start: `{` |
| // End: `b` |
| fn parse_hex(&mut self) -> Result<Build> { |
| if self.bump_if('{') { |
| self.parse_hex_many_digits() |
| } else { |
| self.parse_hex_two_digits() |
| } |
| } |
| |
| // Parses a many-digit hex number, e.g., `a\x{2603}b`. |
| // |
| // Start: `2` |
| // End: `b` |
| fn parse_hex_many_digits(&mut self) -> Result<Build> { |
| use std::char; |
| |
| let s = self.bump_get(|c| c != '}').unwrap_or("".into()); |
| let n = try!(u32::from_str_radix(&s, 16) |
| .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); |
| if !self.bump_if('}') { |
| // e.g., a\x{d |
| return Err(self.err(ErrorKind::UnclosedHex)); |
| } |
| if !self.flags.unicode { |
| return Ok(try!(self.u32_to_one_byte(n))); |
| } |
| let c = try!(char::from_u32(n) |
| .ok_or(self.err(ErrorKind::InvalidScalarValue(n)))); |
| Ok(try!(self.lit(c))) |
| } |
| |
| // Parses a two-digit hex number, e.g., `a\x5ab`. |
| // |
| // Start: `5` |
| // End: `b` |
| fn parse_hex_two_digits(&mut self) -> Result<Build> { |
| use std::char; |
| |
| let mut i = 0; |
| let s = self.bump_get(|_| { i += 1; i <= 2 }).unwrap_or("".into()); |
| if s.len() < 2 { |
| // e.g., a\x |
| // e.g., a\xf |
| return Err(self.err(ErrorKind::UnexpectedTwoDigitHexEof)); |
| } |
| let n = try!(u32::from_str_radix(&s, 16) |
| .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); |
| if !self.flags.unicode { |
| return Ok(try!(self.u32_to_one_byte(n))); |
| } |
| let c = char::from_u32(n).expect("Unicode scalar value"); |
| Ok(try!(self.lit(c))) |
| } |
| |
| // Parses a character class, e.g., `[^a-zA-Z0-9]+`. |
| // |
| // Start: `[` |
| // End: `+` |
| fn parse_class(&mut self) -> Result<Build> { |
| self.bump(); |
| let negated = self.bump_if('^'); |
| let mut class = CharClass::empty(); |
| while self.bump_if('-') { |
| class.ranges.push(ClassRange::one('-')); |
| } |
| loop { |
| if self.eof() { |
| // e.g., [a |
| return Err(self.err(ErrorKind::UnexpectedClassEof)); |
| } |
| match self.cur() { |
| // If no ranges have been added, then `]` is the first |
| // character (sans, perhaps, the `^` symbol), so it should |
| // be interpreted as a `]` instead of a closing class bracket. |
| ']' if class.len() > 0 => { self.bump(); break } |
| '[' => match self.maybe_parse_ascii() { |
| Some(class2) => class.ranges.extend(class2), |
| None => { |
| self.bump(); |
| try!(self.parse_class_range(&mut class, '[')) |
| } |
| }, |
| '\\' => match try!(self.parse_escape()) { |
| Build::Expr(Expr::Class(class2)) => { |
| class.ranges.extend(class2); |
| } |
| Build::Expr(Expr::ClassBytes(class2)) => { |
| for byte_range in class2 { |
| let s = byte_range.start as char; |
| let e = byte_range.end as char; |
| class.ranges.push(ClassRange::new(s, e)); |
| } |
| } |
| Build::Expr(Expr::Literal { chars, .. }) => { |
| try!(self.parse_class_range(&mut class, chars[0])); |
| } |
| Build::Expr(Expr::LiteralBytes { bytes, .. }) => { |
| let start = bytes[0] as char; |
| try!(self.parse_class_range(&mut class, start)); |
| } |
| Build::Expr(e) => { |
| let err = ErrorKind::InvalidClassEscape(e); |
| return Err(self.err(err)); |
| } |
| // Because `parse_escape` can never return `LeftParen`. |
| _ => unreachable!(), |
| }, |
| start => { |
| if !self.flags.unicode { |
| let _ = try!(self.codepoint_to_one_byte(start)); |
| } |
| self.bump(); |
| try!(self.parse_class_range(&mut class, start)); |
| } |
| } |
| } |
| class = self.class_transform(negated, class).canonicalize(); |
| if class.is_empty() { |
| return Err(self.err(ErrorKind::EmptyClass)); |
| } |
| Ok(Build::Expr(if self.flags.unicode { |
| Expr::Class(class) |
| } else { |
| Expr::ClassBytes(class.to_byte_class()) |
| })) |
| } |
| |
| // Parses a single range in a character class. |
| // |
| // Since this is a helper for `parse_class`, its signature sticks out. |
| // Namely, it requires the start character of the range and the char |
| // class to mutate. |
| // |
| // e.g., `[a-z]` |
| // |
| // Start: `-` (with start == `a`) |
| // End: `]` |
| fn parse_class_range(&mut self, class: &mut CharClass, start: char) |
| -> Result<()> { |
| if !self.bump_if('-') { |
| // Not a range, so just push a singleton range. |
| class.ranges.push(ClassRange::one(start)); |
| return Ok(()); |
| } |
| if self.eof() { |
| // e.g., [a- |
| return Err(self.err(ErrorKind::UnexpectedClassEof)); |
| } |
| if self.peek_is(']') { |
| // This is the end of the class, so we permit use of `-` as a |
| // regular char (just like we do in the beginning). |
| class.ranges.push(ClassRange::one(start)); |
| class.ranges.push(ClassRange::one('-')); |
| return Ok(()); |
| } |
| |
| // We have a real range. Just need to check to parse literal and |
| // make sure it's a valid range. |
| let end = match self.cur() { |
| '\\' => match try!(self.parse_escape()) { |
| Build::Expr(Expr::Literal { chars, .. }) => { |
| chars[0] |
| } |
| Build::Expr(Expr::LiteralBytes { bytes, .. }) => { |
| bytes[0] as char |
| } |
| Build::Expr(e) => { |
| return Err(self.err(ErrorKind::InvalidClassEscape(e))); |
| } |
| // Because `parse_escape` can never return `LeftParen`. |
| _ => unreachable!(), |
| }, |
| _ => { |
| let c = self.bump(); |
| if !self.flags.unicode { |
| let _ = try!(self.codepoint_to_one_byte(c)); |
| } |
| c |
| } |
| }; |
| if end < start { |
| // e.g., [z-a] |
| return Err(self.err(ErrorKind::InvalidClassRange { |
| start: start, |
| end: end, |
| })); |
| } |
| class.ranges.push(ClassRange::new(start, end)); |
| Ok(()) |
| } |
| |
| // Parses an ASCII class, e.g., `[:alnum:]+`. |
| // |
| // Start: `[` |
| // End: `+` |
| // |
| // Also supports negation, e.g., `[:^alnum:]`. |
| // |
| // This parsing routine is distinct from the others in that it doesn't |
| // actually report any errors. Namely, if it fails, then the parser should |
| // fall back to parsing a regular class. |
| // |
| // This method will only make progress in the parser if it succeeds. |
| // Otherwise, the input remains where it started. |
| fn maybe_parse_ascii(&mut self) -> Option<CharClass> { |
| fn parse(p: &mut Parser) -> Option<CharClass> { |
| p.bump(); // the `[` |
| if !p.bump_if(':') { return None; } |
| let negate = p.bump_if('^'); |
| let name = match p.bump_get(|c| c != ':') { |
| None => return None, |
| Some(name) => name, |
| }; |
| if !p.bump_if(":]") { return None; } |
| ascii_class(&name).map(|cls| p.class_transform(negate, cls)) |
| } |
| let start = self.chari; |
| match parse(self) { |
| None => { self.chari = start; None } |
| result => result, |
| } |
| } |
| |
| // Parses a Uncode class name, e.g., `a\pLb`. |
| // |
| // Start: `L` |
| // End: `b` |
| // |
| // And also, `a\p{Greek}b`. |
| // |
| // Start: `{` |
| // End: `b` |
| // |
| // `negate` is true when the class name is used with `\P`. |
| fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> { |
| let name = |
| if self.bump_if('{') { |
| let n = self.bump_get(|c| c != '}').unwrap_or("".into()); |
| if n.is_empty() || !self.bump_if('}') { |
| // e.g., \p{Greek |
| return Err(self.err(ErrorKind::UnclosedUnicodeName)); |
| } |
| n |
| } else { |
| if self.eof() { |
| // e.g., \p |
| return Err(self.err(ErrorKind::UnexpectedEscapeEof)); |
| } |
| self.bump().to_string() |
| }; |
| match unicode_class(&name) { |
| None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))), |
| Some(cls) => { |
| if self.flags.unicode { |
| Ok(self.class_transform(neg, cls)) |
| } else { |
| Err(self.err(ErrorKind::UnicodeNotAllowed)) |
| } |
| } |
| } |
| } |
| |
| // Parses a perl character class with Unicode support. |
| // |
| // `name` must be one of d, s, w, D, S, W. If not, this function panics. |
| // |
| // No parser state is changed. |
| fn parse_perl_class(&mut self, name: char) -> CharClass { |
| use unicode::regex::{PERLD, PERLS, PERLW}; |
| let (cls, negate) = match (self.flags.unicode, name) { |
| (true, 'd') => (raw_class_to_expr(PERLD), false), |
| (true, 'D') => (raw_class_to_expr(PERLD), true), |
| (true, 's') => (raw_class_to_expr(PERLS), false), |
| (true, 'S') => (raw_class_to_expr(PERLS), true), |
| (true, 'w') => (raw_class_to_expr(PERLW), false), |
| (true, 'W') => (raw_class_to_expr(PERLW), true), |
| (false, 'd') => (ascii_class("digit").unwrap(), false), |
| (false, 'D') => (ascii_class("digit").unwrap(), true), |
| (false, 's') => (ascii_class("space").unwrap(), false), |
| (false, 'S') => (ascii_class("space").unwrap(), true), |
| (false, 'w') => (ascii_class("word").unwrap(), false), |
| (false, 'W') => (ascii_class("word").unwrap(), true), |
| _ => unreachable!(), |
| }; |
| self.class_transform(negate, cls) |
| } |
| |
| // Always bump to the next input and return the given expression as a |
| // `Build`. |
| // |
| // This is mostly for convenience when the surrounding context implies |
| // that the next character corresponds to the given expression. |
| fn parse_one(&mut self, e: Expr) -> Build { |
| self.bump(); |
| Build::Expr(e) |
| } |
| } |
| |
| // Auxiliary helper methods. |
| impl Parser { |
| fn chars(&self) -> Chars { |
| Chars::new(&self.chars[self.chari..], self.flags.ignore_space) |
| } |
| |
| fn bump(&mut self) -> char { |
| let c = self.cur(); |
| self.chari = checkadd(self.chari, self.chars().next_count()); |
| c |
| } |
| |
| fn cur(&self) -> char { self.chars().next().unwrap() } |
| |
| fn eof(&self) -> bool { self.chars().next().is_none() } |
| |
| fn bump_get<B: Bumpable>(&mut self, s: B) -> Option<String> { |
| let n = s.match_end(self); |
| if n == 0 { |
| None |
| } else { |
| let end = checkadd(self.chari, n); |
| let s = self.chars[self.chari..end] |
| .iter().cloned().collect::<String>(); |
| self.chari = end; |
| Some(s) |
| } |
| } |
| |
| fn bump_if<B: Bumpable>(&mut self, s: B) -> bool { |
| let n = s.match_end(self); |
| if n == 0 { |
| false |
| } else { |
| self.chari = checkadd(self.chari, n); |
| true |
| } |
| } |
| |
| fn peek_is<B: Bumpable>(&self, s: B) -> bool { |
| s.match_end(self) > 0 |
| } |
| |
| fn err(&self, kind: ErrorKind) -> Error { |
| self.errat(self.chari, kind) |
| } |
| |
| fn errat(&self, pos: usize, kind: ErrorKind) -> Error { |
| Error { pos: pos, surround: self.windowat(pos), kind: kind } |
| } |
| |
| fn windowat(&self, pos: usize) -> String { |
| let s = max(5, pos) - 5; |
| let e = min(self.chars.len(), checkadd(pos, 5)); |
| self.chars[s..e].iter().cloned().collect() |
| } |
| |
| fn pop(&mut self, expected: ErrorKind) -> Result<Expr> { |
| match self.stack.pop() { |
| None | Some(Build::LeftParen{..}) => Err(self.err(expected)), |
| Some(Build::Expr(e)) => Ok(e), |
| } |
| } |
| |
| // If the current context calls for case insensitivity, then apply |
| // case folding. Similarly, if `negate` is `true`, then negate the |
| // class. (Negation always proceeds case folding.) |
| fn class_transform(&self, negate: bool, mut cls: CharClass) -> CharClass { |
| if self.flags.casei { |
| cls = cls.case_fold(); |
| } |
| if negate { |
| cls = cls.negate(); |
| } |
| cls |
| } |
| |
| // Translates a Unicode codepoint into a single UTF-8 byte, and returns an |
| // error if it's not possible. |
| // |
| // This will panic if self.flags.unicode == true. |
| fn codepoint_to_one_byte(&self, c: char) -> Result<u8> { |
| assert!(!self.flags.unicode); |
| let bytes = c.to_string().as_bytes().to_owned(); |
| if bytes.len() > 1 { |
| return Err(self.err(ErrorKind::UnicodeNotAllowed)); |
| } |
| Ok(bytes[0]) |
| } |
| |
| // Creates a new byte literal from a single byte. |
| // |
| // If the given number can't fit into a single byte, then it is assumed |
| // to be a Unicode codepoint and an error is returned. |
| // |
| // This should only be called when the bytes flag is enabled. |
| fn u32_to_one_byte(&self, b: u32) -> Result<Build> { |
| assert!(!self.flags.unicode); |
| if b > u8::MAX as u32 { |
| Err(self.err(ErrorKind::UnicodeNotAllowed)) |
| } else if !self.flags.allow_bytes && b > 0x7F { |
| Err(self.err(ErrorKind::InvalidUtf8)) |
| } else { |
| Ok(Build::Expr(Expr::LiteralBytes { |
| bytes: vec![b as u8], |
| casei: self.flags.casei, |
| })) |
| } |
| } |
| |
| // Creates a new literal expr from a Unicode codepoint. |
| // |
| // Creates a byte literal if the `bytes` flag is set. |
| fn lit(&self, c: char) -> Result<Build> { |
| Ok(Build::Expr(if self.flags.unicode { |
| Expr::Literal { |
| chars: vec![c], |
| casei: self.flags.casei, |
| } |
| } else { |
| Expr::LiteralBytes { |
| bytes: vec![try!(self.codepoint_to_one_byte(c))], |
| casei: self.flags.casei, |
| } |
| })) |
| } |
| } |
| |
| struct Chars<'a> { |
| chars: &'a [char], |
| cur: usize, |
| ignore_space: bool, |
| } |
| |
| impl<'a> Iterator for Chars<'a> { |
| type Item = char; |
| fn next(&mut self) -> Option<char> { |
| if !self.ignore_space { |
| let x = self.c(); |
| self.advance(); |
| return x; |
| } |
| while let Some(c) = self.c() { |
| self.advance(); |
| match c { |
| '\\' => return match self.c() { |
| Some('#') => {self.advance(); Some('#')} |
| _ => Some('\\') |
| }, |
| '#' => loop { |
| match self.c() { |
| Some(c) => { |
| self.advance(); |
| if c == '\n' { |
| break; |
| } |
| }, |
| None => return None |
| } |
| }, |
| _ => if !c.is_whitespace() {return Some(c);} |
| } |
| } |
| None |
| } |
| } |
| |
| impl<'a> Chars<'a> { |
| fn new(chars: &[char], ignore_space: bool) -> Chars { |
| Chars { |
| chars: chars, |
| cur: 0, |
| ignore_space: ignore_space, |
| } |
| } |
| |
| fn c(&self) -> Option<char> { |
| self.chars.get(self.cur).map(|&c| c) |
| } |
| |
| fn advance(&mut self) { |
| self.cur = checkadd(self.cur, 1); |
| } |
| |
| fn next_count(&mut self) -> usize { |
| self.next(); |
| self.cur |
| } |
| } |
| |
| // Auxiliary methods for manipulating the expression stack. |
| impl Parser { |
| // Called whenever an alternate (`|`) is found. |
| // |
| // This pops the expression stack until: |
| // |
| // 1. The stack is empty. Pushes an alternation with one arm. |
| // 2. An opening parenthesis is found. Leave the parenthesis |
| // on the stack and push an alternation with one arm. |
| // 3. An alternate (`|`) is found. Pop the existing alternation, |
| // add an arm and push the modified alternation. |
| // |
| // Each "arm" in the above corresponds to the concatenation of all |
| // popped expressions. |
| // |
| // In the first two cases, the stack is left in an invalid state |
| // because an alternation with one arm is not allowed. This |
| // particular state will be detected by `finish_concat` and an |
| // error will be reported. |
| // |
| // In none of the cases is an empty arm allowed. If an empty arm |
| // is found, an error is reported. |
| fn alternate(&mut self) -> Result<Build> { |
| let mut concat = vec![]; |
| let alts = |es| Ok(Build::Expr(Expr::Alternate(es))); |
| loop { |
| match self.stack.pop() { |
| None => { |
| if concat.is_empty() { |
| // e.g., |a |
| return Err(self.err(ErrorKind::EmptyAlternate)); |
| } |
| return alts(vec![rev_concat(concat)]); |
| } |
| Some(e @ Build::LeftParen{..}) => { |
| if concat.is_empty() { |
| // e.g., (|a) |
| return Err(self.err(ErrorKind::EmptyAlternate)); |
| } |
| self.stack.push(e); |
| return alts(vec![rev_concat(concat)]); |
| } |
| Some(Build::Expr(Expr::Alternate(mut es))) => { |
| if concat.is_empty() { |
| // e.g., a|| |
| return Err(self.err(ErrorKind::EmptyAlternate)); |
| } |
| es.push(rev_concat(concat)); |
| return alts(es); |
| } |
| Some(Build::Expr(e)) => { concat.push(e); } |
| } |
| } |
| } |
| |
| // Called whenever a closing parenthesis (`)`) is found. |
| // |
| // This pops the expression stack until: |
| // |
| // 1. The stack is empty. An error is reported because this |
| // indicates an unopened parenthesis. |
| // 2. An opening parenthesis is found. Pop the opening parenthesis |
| // and push a `Group` expression. |
| // 3. An alternate (`|`) is found. Pop the existing alternation |
| // and an arm to it in place. Pop one more item from the stack. |
| // If the stack was empty, then report an unopened parenthesis |
| // error, otherwise assume it is an opening parenthesis and |
| // push a `Group` expression with the popped alternation. |
| // (We can assume this is an opening parenthesis because an |
| // alternation either corresponds to the entire Regex or it |
| // corresponds to an entire group. This is guaranteed by the |
| // `alternate` method.) |
| // |
| // Each "arm" in the above corresponds to the concatenation of all |
| // popped expressions. |
| // |
| // Empty arms nor empty groups are allowed. |
| fn close_paren(&mut self) -> Result<(Flags, Build)> { |
| let mut concat = vec![]; |
| loop { |
| match self.stack.pop() { |
| // e.g., ) |
| None => return Err(self.err(ErrorKind::UnopenedParen)), |
| Some(Build::LeftParen { i, name, old_flags, .. }) => { |
| if concat.is_empty() { |
| // e.g., () |
| return Err(self.err(ErrorKind::EmptyGroup)); |
| } |
| return Ok((old_flags, Build::Expr(Expr::Group { |
| e: Box::new(rev_concat(concat)), |
| i: i, |
| name: name, |
| }))); |
| } |
| Some(Build::Expr(Expr::Alternate(mut es))) => { |
| if concat.is_empty() { |
| // e.g., (a|) |
| return Err(self.err(ErrorKind::EmptyAlternate)); |
| } |
| es.push(rev_concat(concat)); |
| match self.stack.pop() { |
| // e.g., a|b) |
| None => return Err(self.err(ErrorKind::UnopenedParen)), |
| Some(Build::Expr(_)) => unreachable!(), |
| Some(Build::LeftParen { i, name, old_flags, .. }) => { |
| return Ok((old_flags, Build::Expr(Expr::Group { |
| e: Box::new(Expr::Alternate(es)), |
| i: i, |
| name: name, |
| }))); |
| } |
| } |
| } |
| Some(Build::Expr(e)) => { concat.push(e); } |
| } |
| } |
| } |
| |
| // Called only when the parser reaches the end of input. |
| // |
| // This pops the expression stack until: |
| // |
| // 1. The stack is empty. Return concatenation of popped |
| // expressions. This concatenation may be empty! |
| // 2. An alternation is found. Pop the alternation and push |
| // a new arm. Return the alternation as the entire Regex. |
| // After this, the stack must be empty, or else there is |
| // an unclosed paren. |
| // |
| // If an opening parenthesis is popped, then an error is |
| // returned since it indicates an unclosed parenthesis. |
| fn finish_concat(&mut self) -> Result<Expr> { |
| let mut concat = vec![]; |
| loop { |
| match self.stack.pop() { |
| None => { return Ok(rev_concat(concat)); } |
| Some(Build::LeftParen{ chari, ..}) => { |
| // e.g., a(b |
| return Err(self.errat(chari, ErrorKind::UnclosedParen)); |
| } |
| Some(Build::Expr(Expr::Alternate(mut es))) => { |
| if concat.is_empty() { |
| // e.g., a| |
| return Err(self.err(ErrorKind::EmptyAlternate)); |
| } |
| es.push(rev_concat(concat)); |
| // Make sure there are no opening parens remaining. |
| match self.stack.pop() { |
| None => return Ok(Expr::Alternate(es)), |
| Some(Build::LeftParen{ chari, ..}) => { |
| // e.g., (a|b |
| return Err(self.errat( |
| chari, ErrorKind::UnclosedParen)); |
| } |
| e => unreachable!("{:?}", e), |
| } |
| } |
| Some(Build::Expr(e)) => { concat.push(e); } |
| } |
| } |
| } |
| } |
| |
| impl Build { |
| fn is_empty(&self) -> bool { |
| match *self { |
| Build::Expr(Expr::Empty) => true, |
| _ => false, |
| } |
| } |
| } |
| |
| // Make it ergonomic to conditionally bump the parser. |
| // i.e., `bump_if('a')` or `bump_if("abc")`. |
| trait Bumpable { |
| fn match_end(self, p: &Parser) -> usize; |
| } |
| |
| impl Bumpable for char { |
| fn match_end(self, p: &Parser) -> usize { |
| let mut chars = p.chars(); |
| if chars.next().map(|c| c == self).unwrap_or(false) { |
| chars.cur |
| } else { |
| 0 |
| } |
| } |
| } |
| |
| impl<'a> Bumpable for &'a str { |
| fn match_end(self, p: &Parser) -> usize { |
| let mut search = self.chars(); |
| let mut rest = p.chars(); |
| let mut count = 0; |
| loop { |
| match (rest.next(), search.next()) { |
| (Some(c1), Some(c2)) if c1 == c2 => count = rest.cur, |
| (_, None) => return count, |
| _ => return 0, |
| } |
| } |
| } |
| } |
| |
| impl<F: FnMut(char) -> bool> Bumpable for F { |
| fn match_end(mut self, p: &Parser) -> usize { |
| let mut chars = p.chars(); |
| let mut count = 0; |
| while let Some(c) = chars.next() { |
| if !self(c) { |
| break |
| } |
| count = chars.cur; |
| } |
| count |
| } |
| } |
| |
| // Turn a sequence of expressions into a concatenation. |
| // This only uses `Concat` if there are 2 or more expressions. |
| fn rev_concat(mut exprs: Vec<Expr>) -> Expr { |
| if exprs.len() == 0 { |
| Expr::Empty |
| } else if exprs.len() == 1 { |
| exprs.pop().unwrap() |
| } else { |
| exprs.reverse(); |
| Expr::Concat(exprs) |
| } |
| } |
| |
| // Returns true if and only if the given character is allowed in a capture |
| // name. Note that the first char of a capture name must not be numeric. |
| fn is_valid_capture_char(c: char) -> bool { |
| c == '_' || (c >= '0' && c <= '9') |
| || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') |
| } |
| |
| /// Returns true if the give character has significance in a regex. |
| pub fn is_punct(c: char) -> bool { |
| match c { |
| '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | |
| '[' | ']' | '{' | '}' | '^' | '$' | '#' => true, |
| _ => false, |
| } |
| } |
| |
| fn checkadd(x: usize, y: usize) -> usize { |
| x.checked_add(y).expect("regex length overflow") |
| } |
| |
| fn unicode_class(name: &str) -> Option<CharClass> { |
| UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { |
| raw_class_to_expr(UNICODE_CLASSES[i].1) |
| }) |
| } |
| |
| fn ascii_class(name: &str) -> Option<CharClass> { |
| ASCII_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { |
| raw_class_to_expr(ASCII_CLASSES[i].1) |
| }) |
| } |
| |
| fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass { |
| let range = |&(s, e)| ClassRange { start: s, end: e }; |
| CharClass::new(raw.iter().map(range).collect()) |
| } |
| |
| type Class = &'static [(char, char)]; |
| type NamedClasses = &'static [(&'static str, Class)]; |
| |
| const ASCII_CLASSES: NamedClasses = &[ |
| // Classes must be in alphabetical order so that bsearch works. |
| // [:alnum:] alphanumeric (== [0-9A-Za-z]) |
| // [:alpha:] alphabetic (== [A-Za-z]) |
| // [:ascii:] ASCII (== [\x00-\x7F]) |
| // [:blank:] blank (== [\t ]) |
| // [:cntrl:] control (== [\x00-\x1F\x7F]) |
| // [:digit:] digits (== [0-9]) |
| // [:graph:] graphical (== [!-~]) |
| // [:lower:] lower case (== [a-z]) |
| // [:print:] printable (== [ -~] == [ [:graph:]]) |
| // [:punct:] punctuation (== [!-/:-@[-`{-~]) |
| // [:space:] whitespace (== [\t\n\v\f\r ]) |
| // [:upper:] upper case (== [A-Z]) |
| // [:word:] word characters (== [0-9A-Za-z_]) |
| // [:xdigit:] hex digit (== [0-9A-Fa-f]) |
| // Taken from: http://golang.org/pkg/regex/syntax/ |
| ("alnum", &ALNUM), |
| ("alpha", &ALPHA), |
| ("ascii", &ASCII), |
| ("blank", &BLANK), |
| ("cntrl", &CNTRL), |
| ("digit", &DIGIT), |
| ("graph", &GRAPH), |
| ("lower", &LOWER), |
| ("print", &PRINT), |
| ("punct", &PUNCT), |
| ("space", &SPACE), |
| ("upper", &UPPER), |
| ("word", &WORD), |
| ("xdigit", &XDIGIT), |
| ]; |
| |
| const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; |
| const ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; |
| const ASCII: Class = &[('\x00', '\x7F')]; |
| const BLANK: Class = &[(' ', ' '), ('\t', '\t')]; |
| const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; |
| const DIGIT: Class = &[('0', '9')]; |
| const GRAPH: Class = &[('!', '~')]; |
| const LOWER: Class = &[('a', 'z')]; |
| const PRINT: Class = &[(' ', '~')]; |
| const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; |
| const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), |
| ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; |
| const UPPER: Class = &[('A', 'Z')]; |
| const WORD: Class = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')]; |
| const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; |
| |
| #[cfg(test)] |
| mod tests { |
| use { |
| CharClass, ClassRange, ByteClass, ByteRange, |
| Expr, Repeater, |
| ErrorKind, |
| }; |
| use unicode::regex::{PERLD, PERLS, PERLW}; |
| use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class}; |
| |
| static YI: &'static [(char, char)] = &[ |
| ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'), |
| ]; |
| |
| fn p(s: &str) -> Expr { Parser::parse(s, Flags::default()).unwrap() } |
| fn pf(s: &str, flags: Flags) -> Expr { Parser::parse(s, flags).unwrap() } |
| fn lit(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: false } } |
| fn liti(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: true } } |
| fn b<T>(v: T) -> Box<T> { Box::new(v) } |
| fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) } |
| |
| fn pb(s: &str) -> Expr { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| Parser::parse(s, flags).unwrap() |
| } |
| |
| fn blit(b: u8) -> Expr { |
| Expr::LiteralBytes { |
| bytes: vec![b], |
| casei: false, |
| } |
| } |
| |
| fn bliti(b: u8) -> Expr { |
| Expr::LiteralBytes { |
| bytes: vec![b], |
| casei: true, |
| } |
| } |
| |
| fn class(ranges: &[(char, char)]) -> CharClass { |
| let ranges = ranges.iter().cloned() |
| .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); |
| CharClass::new(ranges) |
| } |
| |
| fn classes(classes: &[&[(char, char)]]) -> CharClass { |
| let mut cls = CharClass::empty(); |
| for &ranges in classes { |
| cls.ranges.extend(class(ranges)); |
| } |
| cls.canonicalize() |
| } |
| |
| fn bclass(ranges: &[(u8, u8)]) -> ByteClass { |
| let ranges = ranges.iter().cloned() |
| .map(|(c1, c2)| ByteRange::new(c1, c2)).collect(); |
| ByteClass::new(ranges) |
| } |
| |
| fn asciid() -> CharClass { |
| ascii_class("digit").unwrap() |
| } |
| |
| fn asciis() -> CharClass { |
| ascii_class("space").unwrap() |
| } |
| |
| fn asciiw() -> CharClass { |
| ascii_class("word").unwrap() |
| } |
| |
| fn asciid_bytes() -> ByteClass { |
| asciid().to_byte_class() |
| } |
| |
| fn asciis_bytes() -> ByteClass { |
| asciis().to_byte_class() |
| } |
| |
| fn asciiw_bytes() -> ByteClass { |
| asciiw().to_byte_class() |
| } |
| |
| #[test] |
| fn empty() { |
| assert_eq!(p(""), Expr::Empty); |
| } |
| |
| #[test] |
| fn literal() { |
| assert_eq!(p("a"), lit('a')); |
| assert_eq!(pb("(?-u)a"), blit(b'a')); |
| } |
| |
| #[test] |
| fn literal_string() { |
| assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')])); |
| assert_eq!(pb("(?-u)ab"), Expr::Concat(vec![blit(b'a'), blit(b'b')])); |
| } |
| |
| #[test] |
| fn start_literal() { |
| assert_eq!(p("^a"), Expr::Concat(vec![ |
| Expr::StartText, |
| Expr::Literal { chars: vec!['a'], casei: false }, |
| ])); |
| } |
| |
| #[test] |
| fn repeat_zero_or_one_greedy() { |
| assert_eq!(p("a?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrOne, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_zero_or_one_greedy_concat() { |
| assert_eq!(p("ab?"), Expr::Concat(vec![ |
| lit('a'), |
| Expr::Repeat { |
| e: b(lit('b')), |
| r: Repeater::ZeroOrOne, |
| greedy: true, |
| }, |
| ])); |
| } |
| |
| #[test] |
| fn repeat_zero_or_one_nongreedy() { |
| assert_eq!(p("a??"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrOne, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_one_or_more_greedy() { |
| assert_eq!(p("a+"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::OneOrMore, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_one_or_more_nongreedy() { |
| assert_eq!(p("a+?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::OneOrMore, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_zero_or_more_greedy() { |
| assert_eq!(p("a*"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_zero_or_more_nongreedy() { |
| assert_eq!(p("a*?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_exact() { |
| assert_eq!(p("a{5}"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(5) }, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_min() { |
| assert_eq!(p("a{5,}"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: None }, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_min_max() { |
| assert_eq!(p("a{5,10}"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(10) }, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_exact_nongreedy() { |
| assert_eq!(p("a{5}?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(5) }, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_min_nongreedy() { |
| assert_eq!(p("a{5,}?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: None }, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_min_max_nongreedy() { |
| assert_eq!(p("a{5,10}?"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(10) }, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn repeat_counted_whitespace() { |
| assert_eq!(p("a{ 5 }"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(5) }, |
| greedy: true, |
| }); |
| assert_eq!(p("a{ 5 , 10 }"), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::Range { min: 5, max: Some(10) }, |
| greedy: true, |
| }); |
| } |
| |
| #[test] |
| fn group_literal() { |
| assert_eq!(p("(a)"), Expr::Group { |
| e: b(lit('a')), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn group_literal_concat() { |
| assert_eq!(p("(ab)"), Expr::Group { |
| e: b(c(&[lit('a'), lit('b')])), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn alt_two() { |
| assert_eq!(p("a|b"), Expr::Alternate(vec![lit('a'), lit('b')])); |
| } |
| |
| #[test] |
| fn alt_many() { |
| assert_eq!(p("a|b|c"), Expr::Alternate(vec![ |
| lit('a'), lit('b'), lit('c'), |
| ])); |
| } |
| |
| #[test] |
| fn alt_many_concat() { |
| assert_eq!(p("ab|bc|cd"), Expr::Alternate(vec![ |
| c(&[lit('a'), lit('b')]), |
| c(&[lit('b'), lit('c')]), |
| c(&[lit('c'), lit('d')]), |
| ])); |
| } |
| |
| #[test] |
| fn alt_group_two() { |
| assert_eq!(p("(a|b)"), Expr::Group { |
| e: b(Expr::Alternate(vec![lit('a'), lit('b')])), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn alt_group_many() { |
| assert_eq!(p("(a|b|c)"), Expr::Group { |
| e: b(Expr::Alternate(vec![lit('a'), lit('b'), lit('c')])), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn alt_group_many_concat() { |
| assert_eq!(p("(ab|bc|cd)"), Expr::Group { |
| e: b(Expr::Alternate(vec![ |
| c(&[lit('a'), lit('b')]), |
| c(&[lit('b'), lit('c')]), |
| c(&[lit('c'), lit('d')]), |
| ])), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn alt_group_nested() { |
| assert_eq!(p("(ab|(bc|(cd)))"), Expr::Group { |
| e: b(Expr::Alternate(vec![ |
| c(&[lit('a'), lit('b')]), |
| Expr::Group { |
| e: b(Expr::Alternate(vec![ |
| c(&[lit('b'), lit('c')]), |
| Expr::Group { |
| e: b(c(&[lit('c'), lit('d')])), |
| i: Some(3), |
| name: None, |
| } |
| ])), |
| i: Some(2), |
| name: None, |
| }, |
| ])), |
| i: Some(1), |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn group_name() { |
| assert_eq!(p("(?P<foo>a)"), Expr::Group { |
| e: b(lit('a')), |
| i: Some(1), |
| name: Some("foo".into()), |
| }); |
| } |
| |
| #[test] |
| fn group_no_capture() { |
| assert_eq!(p("(?:a)"), Expr::Group { |
| e: b(lit('a')), |
| i: None, |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn group_flags() { |
| assert_eq!(p("(?i:a)"), Expr::Group { |
| e: b(liti('a')), |
| i: None, |
| name: None, |
| }); |
| assert_eq!(pb("(?i-u:a)"), Expr::Group { |
| e: b(bliti(b'a')), |
| i: None, |
| name: None, |
| }); |
| } |
| |
| #[test] |
| fn group_flags_returned() { |
| assert_eq!(p("(?i:a)a"), c(&[ |
| Expr::Group { |
| e: b(liti('a')), |
| i: None, |
| name: None, |
| }, |
| lit('a'), |
| ])); |
| assert_eq!(pb("(?i-u:a)a"), c(&[ |
| Expr::Group { |
| e: b(bliti(b'a')), |
| i: None, |
| name: None, |
| }, |
| lit('a'), |
| ])); |
| } |
| |
| #[test] |
| fn group_flags_retained() { |
| assert_eq!(p("(?i)(?-i:a)a"), c(&[ |
| Expr::Group { |
| e: b(lit('a')), |
| i: None, |
| name: None, |
| }, |
| liti('a'), |
| ])); |
| assert_eq!(pb("(?i-u)(?u-i:a)a"), c(&[ |
| Expr::Group { |
| e: b(lit('a')), |
| i: None, |
| name: None, |
| }, |
| bliti(b'a'), |
| ])); |
| } |
| |
| #[test] |
| fn flags_inline() { |
| assert_eq!(p("(?i)a"), liti('a')); |
| } |
| |
| #[test] |
| fn flags_inline_multiple() { |
| assert_eq!(p("(?is)a."), c(&[liti('a'), Expr::AnyChar])); |
| } |
| |
| #[test] |
| fn flags_inline_multiline() { |
| assert_eq!(p("(?m)^(?-m)$"), c(&[Expr::StartLine, Expr::EndText])); |
| } |
| |
| #[test] |
| fn flags_inline_swap_greed() { |
| assert_eq!(p("(?U)a*a*?(?i-U)a*a*?"), c(&[ |
| Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: false, |
| }, |
| Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: true, |
| }, |
| Expr::Repeat { |
| e: b(liti('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: true, |
| }, |
| Expr::Repeat { |
| e: b(liti('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: false, |
| }, |
| ])); |
| } |
| |
| #[test] |
| fn flags_inline_multiple_negate_one() { |
| assert_eq!(p("(?is)a.(?i-s)a."), c(&[ |
| liti('a'), Expr::AnyChar, liti('a'), Expr::AnyCharNoNL, |
| ])); |
| } |
| |
| #[test] |
| fn any_byte() { |
| assert_eq!( |
| pb("(?-u).(?u)."), c(&[Expr::AnyByteNoNL, Expr::AnyCharNoNL])); |
| assert_eq!( |
| pb("(?s)(?-u).(?u)."), c(&[Expr::AnyByte, Expr::AnyChar])); |
| } |
| |
| #[test] |
| fn flags_inline_negate() { |
| assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')])); |
| } |
| |
| #[test] |
| fn flags_group_inline() { |
| assert_eq!(p("(a(?i)a)a"), c(&[ |
| Expr::Group { |
| e: b(c(&[lit('a'), liti('a')])), |
| i: Some(1), |
| name: None, |
| }, |
| lit('a'), |
| ])); |
| } |
| |
| #[test] |
| fn flags_group_inline_retain() { |
| assert_eq!(p("(?i)((?-i)a)a"), c(&[ |
| Expr::Group { |
| e: b(lit('a')), |
| i: Some(1), |
| name: None, |
| }, |
| liti('a'), |
| ])); |
| } |
| |
| #[test] |
| fn flags_default_casei() { |
| let flags = Flags { casei: true, .. Flags::default() }; |
| assert_eq!(pf("a", flags), liti('a')); |
| } |
| |
| #[test] |
| fn flags_default_multi() { |
| let flags = Flags { multi: true, .. Flags::default() }; |
| assert_eq!(pf("^", flags), Expr::StartLine); |
| } |
| |
| #[test] |
| fn flags_default_dotnl() { |
| let flags = Flags { dotnl: true, .. Flags::default() }; |
| assert_eq!(pf(".", flags), Expr::AnyChar); |
| } |
| |
| #[test] |
| fn flags_default_swap_greed() { |
| let flags = Flags { swap_greed: true, .. Flags::default() }; |
| assert_eq!(pf("a+", flags), Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::OneOrMore, |
| greedy: false, |
| }); |
| } |
| |
| #[test] |
| fn flags_default_ignore_space() { |
| let flags = Flags { ignore_space: true, .. Flags::default() }; |
| assert_eq!(pf(" a ", flags), lit('a')); |
| } |
| |
| #[test] |
| fn escape_simple() { |
| assert_eq!(p(r"\a\f\t\n\r\v"), c(&[ |
| lit('\x07'), lit('\x0C'), lit('\t'), |
| lit('\n'), lit('\r'), lit('\x0B'), |
| ])); |
| } |
| |
| #[test] |
| fn escape_boundaries() { |
| assert_eq!(p(r"\A\z\b\B"), c(&[ |
| Expr::StartText, Expr::EndText, |
| Expr::WordBoundary, Expr::NotWordBoundary, |
| ])); |
| assert_eq!(pb(r"(?-u)\b\B"), c(&[ |
| Expr::WordBoundaryAscii, Expr::NotWordBoundaryAscii, |
| ])); |
| } |
| |
| #[test] |
| fn escape_punctuation() { |
| assert_eq!(p(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[ |
| lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'), |
| lit('('), lit(')'), lit('|'), lit('['), lit(']'), |
| lit('{'), lit('}'), lit('^'), lit('$'), lit('#'), |
| ])); |
| } |
| |
| #[test] |
| fn escape_octal() { |
| assert_eq!(p(r"\123"), lit('S')); |
| assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')])); |
| |
| assert_eq!(pb(r"(?-u)\377"), blit(0xFF)); |
| } |
| |
| #[test] |
| fn escape_hex2() { |
| assert_eq!(p(r"\x53"), lit('S')); |
| assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')])); |
| |
| assert_eq!(pb(r"(?-u)\xff"), blit(0xFF)); |
| assert_eq!(pb(r"(?-u)\x00"), blit(0x0)); |
| assert_eq!(pb(r"(?-u)[\x00]"), |
| Expr::ClassBytes(bclass(&[(b'\x00', b'\x00')]))); |
| assert_eq!(pb(r"(?-u)[^\x00]"), |
| Expr::ClassBytes(bclass(&[(b'\x01', b'\xFF')]))); |
| } |
| |
| #[test] |
| fn escape_hex() { |
| assert_eq!(p(r"\x{53}"), lit('S')); |
| assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')])); |
| assert_eq!(p(r"\x{2603}"), lit('\u{2603}')); |
| |
| assert_eq!(pb(r"(?-u)\x{00FF}"), blit(0xFF)); |
| } |
| |
| #[test] |
| fn escape_unicode_name() { |
| assert_eq!(p(r"\p{Yi}"), Expr::Class(class(YI))); |
| } |
| |
| #[test] |
| fn escape_unicode_letter() { |
| assert_eq!(p(r"\pZ"), Expr::Class(class(&[ |
| ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), |
| ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), |
| ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), |
| ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), |
| ]))); |
| } |
| |
| #[test] |
| fn escape_unicode_name_case_fold() { |
| assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(YI).case_fold())); |
| } |
| |
| #[test] |
| fn escape_unicode_letter_case_fold() { |
| assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[ |
| ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), |
| ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), |
| ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), |
| ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), |
| ]).case_fold())); |
| } |
| |
| #[test] |
| fn escape_unicode_name_negate() { |
| assert_eq!(p(r"\P{Yi}"), Expr::Class(class(YI).negate())); |
| } |
| |
| #[test] |
| fn escape_unicode_letter_negate() { |
| assert_eq!(p(r"\PZ"), Expr::Class(class(&[ |
| ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), |
| ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), |
| ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), |
| ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), |
| ]).negate())); |
| } |
| |
| #[test] |
| fn escape_unicode_name_negate_case_fold() { |
| assert_eq!(p(r"(?i)\P{Yi}"), |
| Expr::Class(class(YI).negate().case_fold())); |
| } |
| |
| #[test] |
| fn escape_unicode_letter_negate_case_fold() { |
| assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[ |
| ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), |
| ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), |
| ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), |
| ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), |
| ]).negate().case_fold())); |
| } |
| |
| #[test] |
| fn escape_perl_d() { |
| assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); |
| assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid())); |
| } |
| |
| #[test] |
| fn escape_perl_s() { |
| assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); |
| assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis())); |
| } |
| |
| #[test] |
| fn escape_perl_w() { |
| assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); |
| assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw())); |
| } |
| |
| #[test] |
| fn escape_perl_d_negate() { |
| assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); |
| assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate())); |
| } |
| |
| #[test] |
| fn escape_perl_s_negate() { |
| assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); |
| assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate())); |
| } |
| |
| #[test] |
| fn escape_perl_w_negate() { |
| assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); |
| assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate())); |
| } |
| |
| #[test] |
| fn escape_perl_d_case_fold() { |
| assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); |
| assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold())); |
| } |
| |
| #[test] |
| fn escape_perl_s_case_fold() { |
| assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); |
| assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold())); |
| } |
| |
| #[test] |
| fn escape_perl_w_case_fold() { |
| assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); |
| assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold())); |
| } |
| |
| #[test] |
| fn escape_perl_d_case_fold_negate() { |
| assert_eq!(p(r"(?i)\D"), |
| Expr::Class(class(PERLD).case_fold().negate())); |
| let bytes = asciid().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes)); |
| } |
| |
| #[test] |
| fn escape_perl_s_case_fold_negate() { |
| assert_eq!(p(r"(?i)\S"), |
| Expr::Class(class(PERLS).case_fold().negate())); |
| let bytes = asciis().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes)); |
| } |
| |
| #[test] |
| fn escape_perl_w_case_fold_negate() { |
| assert_eq!(p(r"(?i)\W"), |
| Expr::Class(class(PERLW).case_fold().negate())); |
| let bytes = asciiw().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes)); |
| } |
| |
| #[test] |
| fn class_singleton() { |
| assert_eq!(p(r"[a]"), Expr::Class(class(&[('a', 'a')]))); |
| assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')]))); |
| assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')]))); |
| assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')]))); |
| |
| assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')]))); |
| assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)]))); |
| assert_eq!(pb(r"(?-u)[\xFF]"), |
| Expr::ClassBytes(bclass(&[(0xFF, 0xFF)]))); |
| assert_eq!(pb("(?-u)[\n]"), |
| Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); |
| assert_eq!(pb(r"(?-u)[\n]"), |
| Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); |
| } |
| |
| #[test] |
| fn class_singleton_negate() { |
| assert_eq!(p(r"[^a]"), Expr::Class(class(&[ |
| ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), |
| ]))); |
| assert_eq!(p(r"[^\x00]"), Expr::Class(class(&[ |
| ('\x01', '\u{10FFFF}'), |
| ]))); |
| assert_eq!(p(r"[^\n]"), Expr::Class(class(&[ |
| ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), |
| ]))); |
| assert_eq!(p("[^\n]"), Expr::Class(class(&[ |
| ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), |
| ]))); |
| |
| assert_eq!(pb(r"(?-u)[^a]"), Expr::ClassBytes(bclass(&[ |
| (0x00, 0x60), (0x62, 0xFF), |
| ]))); |
| assert_eq!(pb(r"(?-u)[^\x00]"), Expr::ClassBytes(bclass(&[ |
| (0x01, 0xFF), |
| ]))); |
| assert_eq!(pb(r"(?-u)[^\n]"), Expr::ClassBytes(bclass(&[ |
| (0x00, 0x09), (0x0B, 0xFF), |
| ]))); |
| assert_eq!(pb("(?-u)[^\n]"), Expr::ClassBytes(bclass(&[ |
| (0x00, 0x09), (0x0B, 0xFF), |
| ]))); |
| } |
| |
| #[test] |
| fn class_singleton_class() { |
| assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD))); |
| assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI))); |
| |
| let bytes = class(PERLD).to_byte_class(); |
| assert_eq!(pb(r"(?-u)[\d]"), Expr::ClassBytes(bytes)); |
| } |
| |
| #[test] |
| fn class_singleton_class_negate() { |
| assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate())); |
| assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); |
| assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); |
| |
| let bytes = asciid_bytes().negate(); |
| assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes)); |
| let bytes = asciiw_bytes().negate(); |
| assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes)); |
| let bytes = asciis_bytes().negate(); |
| assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes)); |
| } |
| |
| #[test] |
| fn class_singleton_class_negate_negate() { |
| assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD))); |
| assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); |
| assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); |
| |
| assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes())); |
| assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes())); |
| assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes())); |
| } |
| |
| #[test] |
| fn class_singleton_class_casei() { |
| assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); |
| assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); |
| |
| assert_eq!(pb(r"(?i-u)[\d]"), |
| Expr::ClassBytes(asciid_bytes().case_fold())); |
| } |
| |
| #[test] |
| fn class_singleton_class_negate_casei() { |
| assert_eq!(p(r"(?i)[^\d]"), |
| Expr::Class(class(PERLD).case_fold().negate())); |
| assert_eq!(p(r"(?i)[^\w]"), |
| Expr::Class(class(PERLW).case_fold().negate())); |
| assert_eq!(p(r"(?i)[^\s]"), |
| Expr::Class(class(PERLS).case_fold().negate())); |
| |
| let bytes = asciid_bytes().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes)); |
| let bytes = asciiw_bytes().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes)); |
| let bytes = asciis_bytes().case_fold().negate(); |
| assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes)); |
| } |
| |
| #[test] |
| fn class_singleton_class_negate_negate_casei() { |
| assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold())); |
| assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); |
| assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); |
| |
| assert_eq!(pb(r"(?i-u)[^\D]"), |
| Expr::ClassBytes(asciid_bytes().case_fold())); |
| assert_eq!(pb(r"(?i-u)[^\W]"), |
| Expr::ClassBytes(asciiw_bytes().case_fold())); |
| assert_eq!(pb(r"(?i-u)[^\S]"), |
| Expr::ClassBytes(asciis_bytes().case_fold())); |
| } |
| |
| #[test] |
| fn class_multiple_class() { |
| assert_eq!(p(r"[\d\p{Yi}]"), Expr::Class(classes(&[ |
| PERLD, YI, |
| ]))); |
| } |
| |
| #[test] |
| fn class_multiple_class_negate() { |
| assert_eq!(p(r"[^\d\p{Yi}]"), Expr::Class(classes(&[ |
| PERLD, YI, |
| ]).negate())); |
| } |
| |
| #[test] |
| fn class_multiple_class_negate_negate() { |
| let nperlw = class(PERLW).negate(); |
| let nyi = class(YI).negate(); |
| let cls = CharClass::empty().merge(nperlw).merge(nyi); |
| assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate())); |
| } |
| |
| #[test] |
| fn class_multiple_class_casei() { |
| assert_eq!(p(r"(?i)[\d\p{Yi}]"), Expr::Class(classes(&[ |
| PERLD, YI, |
| ]).case_fold())); |
| } |
| |
| #[test] |
| fn class_multiple_class_negate_casei() { |
| assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[ |
| PERLD, YI, |
| ]).case_fold().negate())); |
| } |
| |
| #[test] |
| fn class_multiple_class_negate_negate_casei() { |
| let nperlw = class(PERLW).negate(); |
| let nyi = class(YI).negate(); |
| let class = CharClass::empty().merge(nperlw).merge(nyi); |
| assert_eq!(p(r"(?i)[^\W\P{Yi}]"), |
| Expr::Class(class.case_fold().negate())); |
| } |
| |
| #[test] |
| fn class_class_hypen() { |
| assert_eq!(p(r"[\p{Yi}-]"), Expr::Class(classes(&[ |
| &[('-', '-')], YI, |
| ]))); |
| assert_eq!(p(r"[\p{Yi}-a]"), Expr::Class(classes(&[ |
| &[('-', '-')], &[('a', 'a')], YI, |
| ]))); |
| } |
| |
| #[test] |
| fn class_brackets() { |
| assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')]))); |
| assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')]))); |
| assert_eq!(p("[[]]"), Expr::Concat(vec![ |
| Expr::Class(class(&[('[', '[')])), |
| lit(']'), |
| ])); |
| } |
| |
| #[test] |
| fn class_brackets_hypen() { |
| assert_eq!(p("[]-]"), Expr::Class(class(&[('-', '-'), (']', ']')]))); |
| assert_eq!(p("[-]]"), Expr::Concat(vec![ |
| Expr::Class(class(&[('-', '-')])), |
| lit(']'), |
| ])); |
| } |
| |
| #[test] |
| fn class_overlapping() { |
| assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')]))); |
| assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')]))); |
| |
| assert_eq!(pb("(?-u)[a-fd-h]"), |
| Expr::ClassBytes(bclass(&[(b'a', b'h')]))); |
| assert_eq!(pb("(?-u)[a-fg-m]"), |
| Expr::ClassBytes(bclass(&[(b'a', b'm')]))); |
| } |
| |
| #[test] |
| fn ascii_classes() { |
| assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); |
| assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); |
| |
| assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER))); |
| assert_eq!(pb("(?-u)[[:upper:]]"), |
| Expr::ClassBytes(class(UPPER).to_byte_class())); |
| } |
| |
| #[test] |
| fn ascii_classes_not() { |
| assert_eq!(p("[:abc:]"), |
| Expr::Class(class(&[(':', ':'), ('a', 'c')]))); |
| assert_eq!(pb("(?-u)[:abc:]"), |
| Expr::ClassBytes(bclass(&[(b':', b':'), (b'a', b'c')]))); |
| } |
| |
| #[test] |
| fn ascii_classes_multiple() { |
| assert_eq!(p("[[:lower:][:upper:]]"), |
| Expr::Class(classes(&[UPPER, LOWER]))); |
| |
| assert_eq!(pb("(?-u)[[:lower:][:upper:]]"), |
| Expr::ClassBytes(classes(&[UPPER, LOWER]).to_byte_class())); |
| } |
| |
| #[test] |
| fn ascii_classes_negate() { |
| assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate())); |
| assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER))); |
| |
| assert_eq!(pb("(?-u)[[:^upper:]]"), |
| Expr::ClassBytes(class(UPPER).to_byte_class().negate())); |
| assert_eq!(pb("(?-u)[^[:^upper:]]"), |
| Expr::ClassBytes(class(UPPER).to_byte_class())); |
| } |
| |
| #[test] |
| fn ascii_classes_negate_multiple() { |
| let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate()); |
| let cls = CharClass::empty().merge(nlower).merge(nword); |
| assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone())); |
| assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate())); |
| } |
| |
| #[test] |
| fn ascii_classes_case_fold() { |
| assert_eq!(p("(?i)[:upper:]"), Expr::Class(class(UPPER).case_fold())); |
| assert_eq!(p("(?i)[[:upper:]]"), |
| Expr::Class(class(UPPER).case_fold())); |
| |
| assert_eq!(pb("(?i-u)[:upper:]"), |
| Expr::Class(class(UPPER).case_fold())); |
| assert_eq!(pb("(?i-u)[[:upper:]]"), |
| Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); |
| } |
| |
| #[test] |
| fn ascii_classes_negate_case_fold() { |
| assert_eq!(p("(?i)[[:^upper:]]"), |
| Expr::Class(class(UPPER).case_fold().negate())); |
| assert_eq!(p("(?i)[^[:^upper:]]"), |
| Expr::Class(class(UPPER).case_fold())); |
| |
| assert_eq!(pb("(?i-u)[[:^upper:]]"), |
| Expr::ClassBytes( |
| class(UPPER).to_byte_class().case_fold().negate())); |
| assert_eq!(pb("(?i-u)[^[:^upper:]]"), |
| Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); |
| } |
| |
| #[test] |
| fn single_class_negate_case_fold() { |
| assert_eq!(p("(?i)[^x]"), |
| Expr::Class(class(&[('x', 'x')]).case_fold().negate())); |
| |
| assert_eq!(pb("(?i-u)[^x]"), |
| Expr::ClassBytes( |
| class(&[('x', 'x')]) |
| .to_byte_class().case_fold().negate())); |
| } |
| |
| #[test] |
| fn ignore_space_empty() { |
| assert_eq!(p("(?x) "), Expr::Empty); |
| } |
| |
| #[test] |
| fn ignore_space_literal() { |
| assert_eq!(p("(?x) a b c"), Expr::Concat(vec![ |
| lit('a'), lit('b'), lit('c'), |
| ])); |
| } |
| |
| #[test] |
| fn ignore_space_literal_off() { |
| assert_eq!(p("(?x) a b c(?-x) a"), Expr::Concat(vec![ |
| lit('a'), lit('b'), lit('c'), lit(' '), lit('a'), |
| ])); |
| } |
| |
| #[test] |
| fn ignore_space_class() { |
| assert_eq!(p("(?x)[a |
| - z |
| ]"), Expr::Class(class(&[('a', 'z')]))); |
| assert_eq!(p("(?x)[ ^ a |
| - z |
| ]"), Expr::Class(class(&[('a', 'z')]).negate())); |
| } |
| |
| #[test] |
| fn ignore_space_escape() { |
| assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD))); |
| assert_eq!(p(r"(?x)\ |
| D"), Expr::Class(class(PERLD).negate())); |
| } |
| |
| #[test] |
| fn ignore_space_comments() { |
| assert_eq!(p(r"(?x)(?P<foo> |
| a # comment 1 |
| )(?P<bar> |
| z # comment 2 |
| )"), Expr::Concat(vec![ |
| Expr::Group { |
| e: Box::new(lit('a')), |
| i: Some(1), |
| name: Some("foo".into()), |
| }, |
| Expr::Group { |
| e: Box::new(lit('z')), |
| i: Some(2), |
| name: Some("bar".into()), |
| }, |
| ])); |
| } |
| |
| #[test] |
| fn ignore_space_comments_re_enable() { |
| assert_eq!(p(r"(?x)a # hi |
| (?-x:#) # sweet"), Expr::Concat(vec![ |
| lit('a'), |
| Expr::Group { |
| e: Box::new(lit('#')), |
| i: None, |
| name: None, |
| }, |
| ])); |
| } |
| |
| #[test] |
| fn ignore_space_escape_punctuation() { |
| assert_eq!(p(r"(?x)\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[ |
| lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'), |
| lit('('), lit(')'), lit('|'), lit('['), lit(']'), |
| lit('{'), lit('}'), lit('^'), lit('$'), lit('#'), |
| ])); |
| } |
| |
| #[test] |
| fn ignore_space_escape_hash() { |
| assert_eq!(p(r"(?x)a\# # hi there"), Expr::Concat(vec![ |
| lit('a'), |
| lit('#'), |
| ])); |
| } |
| |
| // Test every single possible error case. |
| |
| macro_rules! test_err { |
| ($re:expr, $pos:expr, $kind:expr) => { |
| test_err!($re, $pos, $kind, Flags::default()); |
| }; |
| ($re:expr, $pos:expr, $kind:expr, $flags:expr) => {{ |
| let err = Parser::parse($re, $flags).unwrap_err(); |
| assert_eq!($pos, err.pos); |
| assert_eq!($kind, err.kind); |
| assert!($re.contains(&err.surround)); |
| }} |
| } |
| |
| #[test] |
| fn invalid_utf8_not_allowed() { |
| // let flags = Flags { unicode: false, .. Flags::default() }; |
| test_err!(r"(?-u)\xFF", 9, ErrorKind::InvalidUtf8); |
| test_err!(r"(?-u).", 5, ErrorKind::InvalidUtf8); |
| test_err!(r"(?-u)(?s).", 9, ErrorKind::InvalidUtf8); |
| test_err!(r"(?-u)[\x00-\x80]", 15, ErrorKind::InvalidUtf8); |
| test_err!(r"(?-u)\222", 9, ErrorKind::InvalidUtf8); |
| test_err!(r"(?-u)\x{0080}", 13, ErrorKind::InvalidUtf8); |
| } |
| |
| #[test] |
| fn unicode_char_not_allowed() { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| test_err!("☃(?-u:☃)", 7, ErrorKind::UnicodeNotAllowed, flags); |
| } |
| |
| #[test] |
| fn unicode_class_not_allowed() { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags); |
| } |
| |
| #[test] |
| fn unicode_class_literal_not_allowed() { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags); |
| test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags); |
| } |
| |
| #[test] |
| fn unicode_hex_not_allowed() { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| test_err!(r"(?-u)\x{FFFF}", 13, ErrorKind::UnicodeNotAllowed, flags); |
| test_err!(r"(?-u)\x{100}", 12, ErrorKind::UnicodeNotAllowed, flags); |
| } |
| |
| #[test] |
| fn unicode_octal_not_allowed() { |
| let flags = Flags { allow_bytes: true, .. Flags::default() }; |
| test_err!(r"(?-u)\400", 9, ErrorKind::UnicodeNotAllowed, flags); |
| } |
| |
| #[test] |
| fn error_repeat_no_expr_simple() { |
| test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr); |
| } |
| |
| #[test] |
| fn error_repeat_no_expr_counted() { |
| test_err!("({5}", 1, ErrorKind::RepeaterExpectsExpr); |
| } |
| |
| #[test] |
| fn error_repeat_beginning_counted() { |
| test_err!("{5}", 0, ErrorKind::RepeaterExpectsExpr); |
| } |
| |
| #[test] |
| fn error_repeat_illegal_exprs_simple() { |
| test_err!("a**", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: true, |
| })); |
| test_err!("a|*", 2, |
| ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) |
| ); |
| } |
| |
| #[test] |
| fn error_repeat_illegal_exprs_counted() { |
| test_err!("a*{5}", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { |
| e: b(lit('a')), |
| r: Repeater::ZeroOrMore, |
| greedy: true, |
| })); |
| test_err!("a|{5}", 2, |
| ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) |
| ); |
| } |
| |
| #[test] |
| fn error_repeat_empty_number() { |
| test_err!("a{}", 2, ErrorKind::MissingBase10); |
| } |
| |
| #[test] |
| fn error_repeat_eof() { |
| test_err!("a{5", 3, ErrorKind::UnclosedRepeat); |
| } |
| |
| #[test] |
| fn error_repeat_empty_number_eof() { |
| test_err!("a{xyz", 5, ErrorKind::InvalidBase10("xyz".into())); |
| test_err!("a{12,xyz", 8, ErrorKind::InvalidBase10("xyz".into())); |
| } |
| |
| #[test] |
| fn error_repeat_invalid_number() { |
| test_err!("a{9999999999}", 12, |
| ErrorKind::InvalidBase10("9999999999".into())); |
| test_err!("a{1,9999999999}", 14, |
| ErrorKind::InvalidBase10("9999999999".into())); |
| } |
| |
| #[test] |
| fn error_repeat_invalid_number_extra() { |
| test_err!("a{12x}", 5, ErrorKind::InvalidBase10("12x".into())); |
| test_err!("a{1,12x}", 7, ErrorKind::InvalidBase10("12x".into())); |
| } |
| |
| #[test] |
| fn error_repeat_invalid_range() { |
| test_err!("a{2,1}", 5, |
| ErrorKind::InvalidRepeatRange { min: 2, max: 1 }); |
| } |
| |
| #[test] |
| fn error_alternate_empty() { |
| test_err!("|a", 0, ErrorKind::EmptyAlternate); |
| } |
| |
| #[test] |
| fn error_alternate_empty_with_group() { |
| test_err!("(|a)", 1, ErrorKind::EmptyAlternate); |
| } |
| |
| #[test] |
| fn error_alternate_empty_with_alternate() { |
| test_err!("a||", 2, ErrorKind::EmptyAlternate); |
| } |
| |
| #[test] |
| fn error_close_paren_unopened_empty() { |
| test_err!(")", 0, ErrorKind::UnopenedParen); |
| } |
| |
| #[test] |
| fn error_close_paren_unopened() { |
| test_err!("ab)", 2, ErrorKind::UnopenedParen); |
| } |
| |
| #[test] |
| fn error_close_paren_unopened_with_alt() { |
| test_err!("a|b)", 3, ErrorKind::UnopenedParen); |
| } |
| |
| #[test] |
| fn error_close_paren_unclosed_with_alt() { |
| test_err!("(a|b", 0, ErrorKind::UnclosedParen); |
| } |
| |
| #[test] |
| fn error_close_paren_empty_alt() { |
| test_err!("(a|)", 3, ErrorKind::EmptyAlternate); |
| } |
| |
| #[test] |
| fn error_close_paren_empty_group() { |
| test_err!("()", 1, ErrorKind::EmptyGroup); |
| } |
| |
| #[test] |
| fn error_close_paren_empty_group_with_name() { |
| test_err!("(?P<foo>)", 8, ErrorKind::EmptyGroup); |
| } |
| |
| #[test] |
| fn error_finish_concat_unclosed() { |
| test_err!("ab(xy", 2, ErrorKind::UnclosedParen); |
| } |
| |
| #[test] |
| fn error_finish_concat_empty_alt() { |
| test_err!("a|", 2, ErrorKind::EmptyAlternate); |
| } |
| |
| #[test] |
| fn error_group_name_invalid() { |
| test_err!("(?P<a#>x)", 6, ErrorKind::InvalidCaptureName("a#".into())); |
| } |
| |
| #[test] |
| fn error_group_name_invalid_leading() { |
| test_err!("(?P<1a>a)", 6, ErrorKind::InvalidCaptureName("1a".into())); |
| } |
| |
| #[test] |
| fn error_group_name_unexpected_eof() { |
| test_err!("(?P<a", 5, ErrorKind::UnclosedCaptureName("a".into())); |
| } |
| |
| #[test] |
| fn error_group_name_empty() { |
| test_err!("(?P<>a)", 4, ErrorKind::EmptyCaptureName); |
| } |
| |
| #[test] |
| fn error_group_opts_unrecognized_flag() { |
| test_err!("(?z:a)", 2, ErrorKind::UnrecognizedFlag('z')); |
| } |
| |
| #[test] |
| fn error_group_opts_unexpected_eof() { |
| test_err!("(?i", 3, ErrorKind::UnexpectedFlagEof); |
| } |
| |
| #[test] |
| fn error_group_opts_double_negation() { |
| test_err!("(?-i-s:a)", 4, ErrorKind::DoubleFlagNegation); |
| } |
| |
| #[test] |
| fn error_group_opts_empty_negation() { |
| test_err!("(?i-:a)", 4, ErrorKind::EmptyFlagNegation); |
| } |
| |
| #[test] |
| fn error_group_opts_empty() { |
| test_err!("(?)", 2, ErrorKind::EmptyFlagNegation); |
| } |
| |
| #[test] |
| fn error_escape_unexpected_eof() { |
| test_err!(r"\", 1, ErrorKind::UnexpectedEscapeEof); |
| } |
| |
| #[test] |
| fn error_escape_unrecognized() { |
| test_err!(r"\m", 1, ErrorKind::UnrecognizedEscape('m')); |
| } |
| |
| #[test] |
| fn error_escape_hex2_eof0() { |
| test_err!(r"\x", 2, ErrorKind::UnexpectedTwoDigitHexEof); |
| } |
| |
| #[test] |
| fn error_escape_hex2_eof1() { |
| test_err!(r"\xA", 3, ErrorKind::UnexpectedTwoDigitHexEof); |
| } |
| |
| #[test] |
| fn error_escape_hex2_invalid() { |
| test_err!(r"\xAG", 4, ErrorKind::InvalidBase16("AG".into())); |
| } |
| |
| #[test] |
| fn error_escape_hex_eof0() { |
| test_err!(r"\x{", 3, ErrorKind::InvalidBase16("".into())); |
| } |
| |
| #[test] |
| fn error_escape_hex_eof1() { |
| test_err!(r"\x{A", 4, ErrorKind::UnclosedHex); |
| } |
| |
| #[test] |
| fn error_escape_hex_invalid() { |
| test_err!(r"\x{AG}", 5, ErrorKind::InvalidBase16("AG".into())); |
| } |
| |
| #[test] |
| fn error_escape_hex_invalid_scalar_value_surrogate() { |
| test_err!(r"\x{D800}", 8, ErrorKind::InvalidScalarValue(0xD800)); |
| } |
| |
| #[test] |
| fn error_escape_hex_invalid_scalar_value_high() { |
| test_err!(r"\x{110000}", 10, ErrorKind::InvalidScalarValue(0x110000)); |
| } |
| |
| #[test] |
| fn error_escape_hex_invalid_u32() { |
| test_err!(r"\x{9999999999}", 13, |
| ErrorKind::InvalidBase16("9999999999".into())); |
| } |
| |
| #[test] |
| fn error_unicode_unclosed() { |
| test_err!(r"\p{", 3, ErrorKind::UnclosedUnicodeName); |
| test_err!(r"\p{Greek", 8, ErrorKind::UnclosedUnicodeName); |
| } |
| |
| #[test] |
| fn error_unicode_no_letter() { |
| test_err!(r"\p", 2, ErrorKind::UnexpectedEscapeEof); |
| } |
| |
| #[test] |
| fn error_unicode_unknown_letter() { |
| test_err!(r"\pA", 3, ErrorKind::UnrecognizedUnicodeClass("A".into())); |
| } |
| |
| #[test] |
| fn error_unicode_unknown_name() { |
| test_err!(r"\p{Yii}", 7, |
| ErrorKind::UnrecognizedUnicodeClass("Yii".into())); |
| } |
| |
| #[test] |
| fn error_class_eof_empty() { |
| test_err!("[", 1, ErrorKind::UnexpectedClassEof); |
| test_err!("[^", 2, ErrorKind::UnexpectedClassEof); |
| } |
| |
| #[test] |
| fn error_class_eof_non_empty() { |
| test_err!("[a", 2, ErrorKind::UnexpectedClassEof); |
| test_err!("[^a", 3, ErrorKind::UnexpectedClassEof); |
| } |
| |
| #[test] |
| fn error_class_eof_range() { |
| test_err!("[a-", 3, ErrorKind::UnexpectedClassEof); |
| test_err!("[^a-", 4, ErrorKind::UnexpectedClassEof); |
| test_err!("[---", 4, ErrorKind::UnexpectedClassEof); |
| } |
| |
| #[test] |
| fn error_class_invalid_escape() { |
| test_err!(r"[\pA]", 4, |
| ErrorKind::UnrecognizedUnicodeClass("A".into())); |
| } |
| |
| #[test] |
| fn error_class_valid_escape_not_allowed() { |
| test_err!(r"[\A]", 3, ErrorKind::InvalidClassEscape(Expr::StartText)); |
| } |
| |
| #[test] |
| fn error_class_range_valid_escape_not_allowed() { |
| test_err!(r"[a-\d]", 5, |
| ErrorKind::InvalidClassEscape(Expr::Class(class(PERLD)))); |
| test_err!(r"[a-\A]", 5, |
| ErrorKind::InvalidClassEscape(Expr::StartText)); |
| test_err!(r"[\A-a]", 3, |
| ErrorKind::InvalidClassEscape(Expr::StartText)); |
| } |
| |
| #[test] |
| fn error_class_invalid_range() { |
| test_err!("[z-a]", 4, ErrorKind::InvalidClassRange { |
| start: 'z', |
| end: 'a', |
| }); |
| } |
| |
| #[test] |
| fn error_class_empty_range() { |
| test_err!("[]", 2, ErrorKind::UnexpectedClassEof); |
| test_err!("[^]", 3, ErrorKind::UnexpectedClassEof); |
| test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass); |
| } |
| |
| #[test] |
| fn error_duplicate_capture_name() { |
| test_err!("(?P<a>.)(?P<a>.)", 14, |
| ErrorKind::DuplicateCaptureName("a".into())); |
| } |
| } |