vendor/regex-0.1.80/src/lib.rs - toolchain/rustc - Git at Google

 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 //! This crate provides a native implementation of regular expressions that is
 //! heavily based on RE2 both in syntax and in implementation. Notably,
 //! backreferences and arbitrary lookahead/lookbehind assertions are not
 //! provided. In return, regular expression searching provided by this package
 //! has excellent worst-case performance. The specific syntax supported is
 //! documented further down.
 //!
 //! This crate's documentation provides some simple examples, describes Unicode
 //! support and exhaustively lists the supported syntax. For more specific
 //! details on the API, please see the documentation for the
 //! [`Regex`](struct.Regex.html) type.
 //!
 //! # Usage
 //!
 //! This crate is [on crates.io](https://crates.io/crates/regex) and can be
 //! used by adding `regex` to your dependencies in your project's `Cargo.toml`.
 //!
 //! ```toml
 //! [dependencies]
 //! regex = "0.1"
 //! ```
 //!
 //! and this to your crate root:
 //!
 //! ```rust
 //! extern crate regex;
 //! ```
 //!
 //! # Example: find a date
 //!
 //! General use of regular expressions in this package involves compiling an
 //! expression and then using it to search, split or replace text. For example,
 //! to confirm that some text resembles a date:
 //!
 //! ```rust
 //! use regex::Regex;
 //! let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
 //! assert!(re.is_match("2014-01-01"));
 //! ```
 //!
 //! Notice the use of the `^` and `$` anchors. In this crate, every expression
 //! is executed with an implicit `.*?` at the beginning and end, which allows
 //! it to match anywhere in the text. Anchors can be used to ensure that the
 //! full text matches an expression.
 //!
 //! This example also demonstrates the utility of
 //! [raw strings](https://doc.rust-lang.org/stable/reference.html#raw-string-literals)
 //! in Rust, which
 //! are just like regular strings except they are prefixed with an `r` and do
 //! not process any escape sequences. For example, `"\\d"` is the same
 //! expression as `r"\d"`.
 //!
 //! # Example: Avoid compiling the same regex in a loop
 //!
 //! It is an anti-pattern to compile the same regular expression in a loop
 //! since compilation is typically expensive. (It takes anywhere from a few
 //! microseconds to a few **milliseconds** depending on the size of the
 //! regex.) Not only is compilation itself expensive, but this also prevents
 //! optimizations that reuse allocations internally to the matching engines.
 //!
 //! In Rust, it can sometimes be a pain to pass regular expressions around if
 //! they're used from inside a helper function. Instead, we recommend using the
 //! [`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
 //! regular expressions are compiled exactly once.
 //!
 //! For example:
 //!
 //! ```rust
 //! #[macro_use] extern crate lazy_static;
 //! extern crate regex;
 //!
 //! use regex::Regex;
 //!
 //! fn some_helper_function(text: &str) -> bool {
 //!     lazy_static! {
 //!         static ref RE: Regex = Regex::new("...").unwrap();
 //!     }
 //!     RE.is_match(text)
 //! }
 //!
 //! fn main() {}
 //! ```
 //!
 //! Specifically, in this example, the regex will be compiled when it is used for
 //! the first time. On subsequent uses, it will reuse the previous compilation.
 //!
 //! # Example: iterating over capture groups
 //!
 //! This crate provides convenient iterators for matching an expression
 //! repeatedly against a search string to find successive non-overlapping
 //! matches. For example, to find all dates in a string and be able to access
 //! them by their component pieces:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
 //! let text = "2012-03-14, 2013-01-01 and 2014-07-05";
 //! for cap in re.captures_iter(text) {
 //!     println!("Month: {} Day: {} Year: {}",
 //!              cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""),
 //!              cap.at(1).unwrap_or(""));
 //! }
 //! // Output:
 //! // Month: 03 Day: 14 Year: 2012
 //! // Month: 01 Day: 01 Year: 2013
 //! // Month: 07 Day: 05 Year: 2014
 //! # }
 //! ```
 //!
 //! Notice that the year is in the capture group indexed at `1`. This is
 //! because the *entire match* is stored in the capture group at index `0`.
 //!
 //! # Example: replacement with named capture groups
 //!
 //! Building on the previous example, perhaps we'd like to rearrange the date
 //! formats. This can be done with text replacement. But to make the code
 //! clearer, we can *name* our capture groups and use those names as variables
 //! in our replacement text:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
 //! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
 //! let after = re.replace_all(before, "$m/$d/$y");
 //! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
 //! # }
 //! ```
 //!
 //! The `replace` methods are actually polymorphic in the replacement, which
 //! provides more flexibility than is seen here. (See the documentation for
 //! `Regex::replace` for more details.)
 //!
 //! Note that if your regex gets complicated, you can use the `x` flag to
 //! enable insignificant whitespace mode, which also lets you write comments:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(?x)
 //!   (?P<y>\d{4}) # the year
 //!   -
 //!   (?P<m>\d{2}) # the month
 //!   -
 //!   (?P<d>\d{2}) # the day
 //! ").unwrap();
 //! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
 //! let after = re.replace_all(before, "$m/$d/$y");
 //! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
 //! # }
 //! ```
 //!
 //! # Example: match multiple regular expressions simultaneously
 //!
 //! This demonstrates how to use a `RegexSet` to match multiple (possibly
 //! overlapping) regular expressions in a single scan of the search text:
 //!
 //! ```rust
 //! use regex::RegexSet;
 //!
 //! let set = RegexSet::new(&[
 //!     r"\w+",
 //!     r"\d+",
 //!     r"\pL+",
 //!     r"foo",
 //!     r"bar",
 //!     r"barfoo",
 //!     r"foobar",
 //! ]).unwrap();
 //!
 //! // Iterate over and collect all of the matches.
 //! let matches: Vec<_> = set.matches("foobar").into_iter().collect();
 //! assert_eq!(matches, vec![0, 2, 3, 4, 6]);
 //!
 //! // You can also test whether a particular regex matched:
 //! let matches = set.matches("foobar");
 //! assert!(!matches.matched(5));
 //! assert!(matches.matched(6));
 //! ```
 //!
 //! # Pay for what you use
 //!
 //! With respect to searching text with a regular expression, there are three
 //! questions that can be asked:
 //!
 //! 1. Does the text match this expression?
 //! 2. If so, where does it match?
 //! 3. Where are the submatches?
 //!
 //! Generally speaking, this crate could provide a function to answer only #3,
 //! which would subsume #1 and #2 automatically. However, it can be
 //! significantly more expensive to compute the location of submatches, so it's
 //! best not to do it if you don't need to.
 //!
 //! Therefore, only use what you need. For example, don't use `find` if you
 //! only need to test if an expression matches a string. (Use `is_match`
 //! instead.)
 //!
 //! # Unicode
 //!
 //! This implementation executes regular expressions **only** on valid UTF-8
 //! while exposing match locations as byte indices into the search string.
 //!
 //! Only simple case folding is supported. Namely, when matching
 //! case-insensitively, the characters are first mapped using the [simple case
 //! folding](https://www.unicode.org/Public/UNIDATA/CaseFolding.txt) mapping
 //! before matching.
 //!
 //! Regular expressions themselves are **only** interpreted as a sequence of
 //! Unicode scalar values. This means you can use Unicode characters directly
 //! in your expression:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(?i)Δ+").unwrap();
 //! assert_eq!(re.find("ΔδΔ"), Some((0, 6)));
 //! # }
 //! ```
 //!
 //! Finally, Unicode general categories and scripts are available as character
 //! classes. For example, you can match a sequence of numerals, Greek or
 //! Cherokee letters:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
 //! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23)));
 //! # }
 //! ```
 //!
 //! # Opt out of Unicode support
 //!
 //! The `bytes` sub-module provides a `Regex` type that can be used to match
 //! on `&[u8]`. By default, text is interpreted as ASCII compatible text with
 //! all Unicode support disabled (e.g., `.` matches any byte instead of any
 //! Unicode codepoint). Unicode support can be selectively enabled with the
 //! `u` flag. See the `bytes` module documentation for more details.
 //!
 //! Unicode support can also be selectively *disabled* with the main `Regex`
 //! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII
 //! word boundary. Note though that invalid UTF-8 is not allowed to be matched
 //! even when the `u` flag is disabled. For example, `(?-u:.)` will return an
 //! error, since `.` matches *any byte* when Unicode support is disabled.
 //!
 //! # Syntax
 //!
 //! The syntax supported in this crate is almost in an exact correspondence
 //! with the syntax supported by RE2. It is documented below.
 //!
 //! Note that the regular expression parser and abstract syntax are exposed in
 //! a separate crate, [`regex-syntax`](../regex_syntax/index.html).
 //!
 //! ## Matching one character
 //!
 //! <pre class="rust">
 //! .           any character except new line (includes new line with s flag)
 //! [xyz]       A character class matching either x, y or z.
 //! [^xyz]      A character class matching any character except x, y and z.
 //! [a-z]       A character class matching any character in range a-z.
 //! \d          digit (\p{Nd})
 //! \D          not digit
 //! [:alpha:]   ASCII character class ([A-Za-z])
 //! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
 //! \pN         One-letter name Unicode character class
 //! \p{Greek}   Unicode character class (general category or script)
 //! \PN         Negated one-letter name Unicode character class
 //! \P{Greek}   negated Unicode character class (general category or script)
 //! </pre>
 //!
 //! Any named character class may appear inside a bracketed `[...]` character
 //! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral
 //! character.
 //!
 //! ## Composites
 //!
 //! <pre class="rust">
 //! xy    concatenation (x followed by y)
 //! x|y   alternation (x or y, prefer x)
 //! </pre>
 //!
 //! ## Repetitions
 //!
 //! <pre class="rust">
 //! x*        zero or more of x (greedy)
 //! x+        one or more of x (greedy)
 //! x?        zero or one of x (greedy)
 //! x*?       zero or more of x (ungreedy/lazy)
 //! x+?       one or more of x (ungreedy/lazy)
 //! x??       zero or one of x (ungreedy/lazy)
 //! x{n,m}    at least n x and at most m x (greedy)
 //! x{n,}     at least n x (greedy)
 //! x{n}      exactly n x
 //! x{n,m}?   at least n x and at most m x (ungreedy/lazy)
 //! x{n,}?    at least n x (ungreedy/lazy)
 //! x{n}?     exactly n x
 //! </pre>
 //!
 //! ## Empty matches
 //!
 //! <pre class="rust">
 //! ^     the beginning of text (or start-of-line with multi-line mode)
 //! $     the end of text (or end-of-line with multi-line mode)
 //! \A    only the beginning of text (even with multi-line mode enabled)
 //! \z    only the end of text (even with multi-line mode enabled)
 //! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
 //! \B    not a Unicode word boundary
 //! </pre>
 //!
 //! ## Grouping and flags
 //!
 //! <pre class="rust">
 //! (exp)          numbered capture group (indexed by opening parenthesis)
 //! (?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
 //! (?:exp)        non-capturing group
 //! (?flags)       set flags within current group
 //! (?flags:exp)   set flags for exp (non-capturing)
 //! </pre>
 //!
 //! Flags are each a single character. For example, `(?x)` sets the flag `x`
 //! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
 //! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
 //! the `x` flag and clears the `y` flag.
 //!
 //! All flags are by default disabled unless stated otherwise. They are:
 //!
 //! <pre class="rust">
 //! i     case-insensitive
 //! m     multi-line mode: ^ and $ match begin/end of line
 //! s     allow . to match \n
 //! U     swap the meaning of x* and x*?
 //! u     Unicode support (enabled by default)
 //! x     ignore whitespace and allow line comments (starting with `#`)
 //! </pre>
 //!
 //! Here's an example that matches case-insensitively for only part of the
 //! expression:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
 //! let cap = re.captures("AaAaAbbBBBb").unwrap();
 //! assert_eq!(cap.at(0), Some("AaAaAbb"));
 //! # }
 //! ```
 //!
 //! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
 //! `b`.
 //!
 //! Here is an example that uses an ASCII word boundary instead of a Unicode
 //! word boundary:
 //!
 //! ```rust
 //! # extern crate regex; use regex::Regex;
 //! # fn main() {
 //! let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
 //! let cap = re.captures("$$abc$$").unwrap();
 //! assert_eq!(cap.at(0), Some("abc"));
 //! # }
 //! ```
 //!
 //! ## Escape sequences
 //!
 //! <pre class="rust">
 //! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
 //! \a         bell (\x07)
 //! \f         form feed (\x0C)
 //! \t         horizontal tab
 //! \n         new line
 //! \r         carriage return
 //! \v         vertical tab (\x0B)
 //! \123       octal character code (up to three digits)
 //! \x7F       hex character code (exactly two digits)
 //! \x{10FFFF} any hex character code corresponding to a Unicode code point
 //! </pre>
 //!
 //! ## Perl character classes (Unicode friendly)
 //!
 //! These classes are based on the definitions provided in
 //! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
 //!
 //! <pre class="rust">
 //! \d     digit (\p{Nd})
 //! \D     not digit
 //! \s     whitespace (\p{White_Space})
 //! \S     not whitespace
 //! \w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
 //! \W     not word character
 //! </pre>
 //!
 //! ## ASCII character classes
 //!
 //! <pre class="rust">
 //! [:alnum:]    alphanumeric ([0-9A-Za-z])
 //! [:alpha:]    alphabetic ([A-Za-z])
 //! [:ascii:]    ASCII ([\x00-\x7F])
 //! [:blank:]    blank ([\t ])
 //! [:cntrl:]    control ([\x00-\x1F\x7F])
 //! [:digit:]    digits ([0-9])
 //! [:graph:]    graphical ([!-~])
 //! [:lower:]    lower case ([a-z])
 //! [:print:]    printable ([ -~])
 //! [:punct:]    punctuation ([!-/:-@[-`{-~])
 //! [:space:]    whitespace ([\t\n\v\f\r ])
 //! [:upper:]    upper case ([A-Z])
 //! [:word:]     word characters ([0-9A-Za-z_])
 //! [:xdigit:]   hex digit ([0-9A-Fa-f])
 //! </pre>
 //!
 //! # Untrusted input
 //!
 //! This crate can handle both untrusted regular expressions and untrusted
 //! search text.
 //!
 //! Untrusted regular expressions are handled by capping the size of a compiled
 //! regular expression. (See `Regex::with_size_limit`.) Without this, it would
 //! be trivial for an attacker to exhaust your system's memory with expressions
 //! like `a{100}{100}{100}`.
 //!
 //! Untrusted search text is allowed because the matching engine(s) in this
 //! crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
 //! text`), which means there's no way to cause exponential blow-up like with
 //! some other regular expression engines. (We pay for this by disallowing
 //! features like arbitrary look-ahead and backreferences.)
 //!
 //! When a DFA is used, pathological cases with exponential state blow up are
 //! avoided by constructing the DFA lazily or in an "online" manner. Therefore,
 //! at most one new state can be created for each byte of input. This satisfies
 //! our time complexity guarantees, but can lead to unbounded memory growth
 //! proportional to the size of the input. As a stopgap, the DFA is only
 //! allowed to store a fixed number of states. (When the limit is reached, its
 //! states are wiped and it continues on, possibly duplicating previous work. If
 //! the limit is reached too frequently, it gives up and hands control off to
 //! another matching engine with fixed memory requirements.)

 #![deny(missing_docs)]
 #![cfg_attr(test, deny(warnings))]
 #![cfg_attr(feature = "pattern", feature(pattern))]
 #![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))]
 #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
        html_favicon_url = "https://www.rust-lang.org/favicon.ico",
        html_root_url = "https://doc.rust-lang.org/regex/")]

 extern crate aho_corasick;
 extern crate memchr;
 extern crate thread_local;
 #[cfg(test)] extern crate quickcheck;
 extern crate regex_syntax as syntax;
 #[cfg(feature = "simd-accel")] extern crate simd;
 extern crate utf8_ranges;

 pub use error::Error;
 pub use re_builder::unicode::*;
 pub use re_set::unicode::*;
 pub use re_unicode::{
     Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed,
     CaptureNames, FindCaptures, FindMatches,
     Replacer, NoExpand, RegexSplits, RegexSplitsN,
     quote, is_match,
 };

 /**
 Match regular expressions on arbitrary bytes.

 This module provides a nearly identical API to the one found in the
 top-level of this crate. There are two important differences:

 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
 is used where `String` would have been used.
 2. Regular expressions are compiled with Unicode support *disabled* by
 default. This means that while Unicode regular expressions can only match valid
 UTF-8, regular expressions in this module can match arbitrary bytes. Unicode
 support can be selectively enabled via the `u` flag in regular expressions
 provided by this sub-module.

 # Example: match null terminated string

 This shows how to find all null-terminated strings in a slice of bytes:

 ```rust
 # use regex::bytes::Regex;
 let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
 let text = b"foo\x00bar\x00baz\x00";

 // Extract all of the strings without the null terminator from each match.
 // The unwrap is OK here since a match requires the `cstr` capture to match.
 let cstrs: Vec<&[u8]> =
     re.captures_iter(text)
       .map(|c| c.name("cstr").unwrap())
       .collect();
 assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
 ```

 # Example: selectively enable Unicode support

 This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
 string (e.g., to extract a title from a Matroska file):

 ```rust
 # use std::str;
 # use regex::bytes::Regex;
 let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap();
 let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
 let caps = re.captures(text).unwrap();

 // Notice that despite the `.*` at the end, it will only match valid UTF-8
 // because Unicode mode was enabled with the `u` flag. Without the `u` flag,
 // the `.*` would match the rest of the bytes.
 assert_eq!((7, 10), caps.pos(1).unwrap());

 // If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
 let title = str::from_utf8(caps.at(1).unwrap()).unwrap();
 assert_eq!("☃", title);
 ```

 In general, if the Unicode flag is enabled in a capture group and that capture
 is part of the overall match, then the capture is *guaranteed* to be valid
 UTF-8.

 # Syntax

 The supported syntax is pretty much the same as the syntax for Unicode
 regular expressions with a few changes that make sense for matching arbitrary
 bytes:

 1. The `u` flag is *disabled* by default, but can be selectively enabled. (The
 opposite is true for the main `Regex` type.) Disabling the `u` flag is said to
 invoke "ASCII compatible" mode.
 2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character
 classes are allowed.
 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
 revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
 to `[[:digit:]]` and `\s` maps to `[[:space:]]`.
 4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
 determine whether a byte is a word byte or not.
 5. Hexadecimal notation can be used to specify arbitrary bytes instead of
 Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
 literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
 matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation.
 6. `.` matches any *byte* except for `\n` instead of any codepoint. When the
 `s` flag is enabled, `.` matches any byte.

 # Performance

 In general, one should expect performance on `&[u8]` to be roughly similar to
 performance on `&str`.
 */
 pub mod bytes {
     // Everything here is a glob re-export of the byte-oriented
     // implementation modules; the order of the globs carries no meaning.
     pub use re_bytes::*;
     pub use re_builder::bytes::*;
     pub use re_set::bytes::*;
 }

 mod backtrack;
 mod utf8;
 mod compile;
 mod dfa;
 mod error;
 mod exec;
 mod expand;
 mod freqs;
 mod input;
 mod literals;
 #[cfg(feature = "pattern")]
 mod pattern;
 mod pikevm;
 mod prog;
 mod re_builder;
 mod re_bytes;
 mod re_plugin;
 mod re_set;
 mod re_trait;
 mod re_unicode;
 #[cfg(feature = "simd-accel")]
 mod simd_accel;
 #[cfg(not(feature = "simd-accel"))]
 #[path = "simd_fallback/mod.rs"]
 mod simd_accel;
 mod sparse;

 /// Re-exports of implementation details needed by the `regex!` macro and by
 /// other unusual consumers, such as tests that exercise individual matching
 /// engines and the `regex-debug` CLI utility.
 ///
 /// The module is hidden from the generated documentation.
 #[doc(hidden)]
 pub mod internal {
     pub use compile::Compiler;

     pub use exec::Exec;
     pub use exec::ExecBuilder;

     pub use input::Char;
     pub use input::Input;
     pub use input::CharInput;
     pub use input::InputAt;

     pub use literals::LiteralSearcher;

     pub use prog::Program;
     pub use prog::Inst;
     pub use prog::EmptyLook;
     pub use prog::InstRanges;

     pub use re_plugin::Plugin;

     pub use re_unicode::_Regex;
 }