blob: a37445271e107104f9c2a2bf1fddd7f41e4c010e [file] [log] [blame] [edit]
use std::cell::RefCell;
use std::collections::HashMap;
use std::fmt;
use std::ops::Index;
use std::sync::Arc;
use log::debug;
use pcre2_sys::{
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE,
PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET,
PCRE2_NEWLINE_ANYCRLF,
};
use thread_local::CachedThreadLocal;
use crate::error::Error;
use crate::ffi::{Code, CompileContext, MatchConfig, MatchData};
/// A single match of a regex in a subject string.
///
/// A match remembers the subject it was produced from along with the byte
/// offsets at which the match begins and ends.
///
/// The lifetime parameter `'s` refers to the lifetime of the subject string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Match<'s> {
    subject: &'s [u8],
    start: usize,
    end: usize,
}

impl<'s> Match<'s> {
    /// The byte offset at which this match begins.
    #[inline]
    pub fn start(&self) -> usize {
        let Match { start, .. } = *self;
        start
    }

    /// The byte offset at which this match ends.
    #[inline]
    pub fn end(&self) -> usize {
        let Match { end, .. } = *self;
        end
    }

    /// The bytes of the stored subject that lie between the start and end
    /// offsets of this match.
    #[inline]
    pub fn as_bytes(&self) -> &'s [u8] {
        let span = self.start..self.end;
        &self.subject[span]
    }

    /// Builds a match out of a subject string and a pair of byte offsets.
    fn new(subject: &'s [u8], start: usize, end: usize) -> Match<'s> {
        Self { subject: subject, start: start, end: end }
    }

    /// The `(start, end)` offsets of this match, for use in tests.
    #[cfg(test)]
    fn as_pair(&self) -> (usize, usize) {
        let Match { start, end, .. } = *self;
        (start, end)
    }
}
/// The complete set of options a `RegexBuilder` collects.
///
/// The compile-time switches are translated into PCRE2 options by
/// `RegexBuilder::build`; the remaining settings are consulted when a search
/// is executed.
#[derive(Clone, Debug)]
struct Config {
/// When set, the pattern is compiled with PCRE2_CASELESS.
caseless: bool,
/// When set, the pattern is compiled with PCRE2_DOTALL.
dotall: bool,
/// When set, the pattern is compiled with PCRE2_EXTENDED.
extended: bool,
/// When set, the pattern is compiled with PCRE2_MULTILINE.
multi_line: bool,
/// When set, the compile context's newline convention is set to
/// PCRE2_NEWLINE_ANYCRLF.
crlf: bool,
/// When set, the pattern is compiled with PCRE2_UCP (and also PCRE2_UTF).
ucp: bool,
/// When set, the pattern is compiled with PCRE2_UTF.
utf: bool,
/// When this is `false`, searches are run with PCRE2_NO_UTF_CHECK.
utf_check: bool,
/// Whether (and how strictly) to use pcre2_jit_compile.
jit: JITChoice,
/// Match-time specific configuration knobs.
match_config: MatchConfig,
}
/// The JIT compilation policy applied when a regex is built.
#[derive(Clone, Debug)]
enum JITChoice {
/// Never do JIT compilation.
Never,
/// Always do JIT compilation and return an error if it fails.
Always,
/// Attempt to do JIT compilation but silently fall back to non-JIT
/// (the failure is only reported through a debug log message).
Attempt,
}
impl Default for Config {
    /// The configuration a fresh `RegexBuilder` starts from: every optional
    /// matching mode is off, UTF checking is on and the JIT is never used.
    fn default() -> Self {
        Self {
            // Pattern compilation switches.
            caseless: false,
            dotall: false,
            extended: false,
            multi_line: false,
            crlf: false,
            ucp: false,
            utf: false,
            // Search-time behavior.
            utf_check: true,
            jit: JITChoice::Never,
            match_config: MatchConfig::default(),
        }
    }
}
/// A builder for configuring the compilation of a PCRE2 regex.
///
/// Options are accumulated through the setter methods and are applied when
/// `build` is called.
#[derive(Clone, Debug)]
pub struct RegexBuilder {
/// The options accumulated so far.
config: Config,
}
impl RegexBuilder {
/// Create a new builder with a default configuration.
pub fn new() -> RegexBuilder {
    let config = Config::default();
    RegexBuilder { config }
}
/// Compile the given pattern into a PCRE regex using the current
/// configuration.
///
/// If there was a problem compiling the pattern, then an error is
/// returned.
pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
let mut options = 0;
if self.config.caseless {
options |= PCRE2_CASELESS;
}
if self.config.dotall {
options |= PCRE2_DOTALL;
}
if self.config.extended {
options |= PCRE2_EXTENDED;
}
if self.config.multi_line {
options |= PCRE2_MULTILINE;
}
if self.config.ucp {
options |= PCRE2_UCP;
options |= PCRE2_UTF;
}
if self.config.utf {
options |= PCRE2_UTF;
}
let mut ctx = CompileContext::new();
if self.config.crlf {
ctx.set_newline(PCRE2_NEWLINE_ANYCRLF)
.expect("PCRE2_NEWLINE_ANYCRLF is a legal value");
}
let mut code = Code::new(pattern, options, ctx)?;
match self.config.jit {
JITChoice::Never => {} // fallthrough
JITChoice::Always => {
code.jit_compile()?;
}
JITChoice::Attempt => {
if let Err(err) = code.jit_compile() {
debug!("JIT compilation failed: {}", err);
}
}
}
let capture_names = code.capture_names()?;
let mut idx = HashMap::new();
for (i, group) in capture_names.iter().enumerate() {
if let Some(ref name) = *group {
idx.insert(name.to_string(), i);
}
}
Ok(Regex {
config: Arc::new(self.config.clone()),
pattern: pattern.to_string(),
code: Arc::new(code),
capture_names: Arc::new(capture_names),
capture_names_idx: Arc::new(idx),
match_data: CachedThreadLocal::new(),
})
}
/// Enables case insensitive matching.
///
/// If the `utf` option is also set, then Unicode case folding is used
/// to determine case insensitivity. When the `utf` option is not set,
/// then only standard ASCII case insensitivity is considered.
///
/// This option corresponds to the `i` flag.
pub fn caseless(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_CASELESS.
    let config = &mut self.config;
    config.caseless = yes;
    self
}
/// Enables "dot all" matching.
///
/// When enabled, the `.` metacharacter in the pattern matches any
/// character, including `\n`. When disabled (the default), `.` will match
/// any character except for `\n`.
///
/// This option corresponds to the `s` flag.
pub fn dotall(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_DOTALL.
    let config = &mut self.config;
    config.dotall = yes;
    self
}
/// Enable "extended" mode in the pattern, where whitespace is ignored.
///
/// This option corresponds to the `x` flag.
pub fn extended(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_EXTENDED.
    let config = &mut self.config;
    config.extended = yes;
    self
}
/// Enable multiline matching mode.
///
/// When enabled, the `^` and `$` anchors will match both at the beginning
/// and end of a subject string, in addition to matching at the start of
/// a line and the end of a line. When disabled, the `^` and `$` anchors
/// will only match at the beginning and end of a subject string.
///
/// This option corresponds to the `m` flag.
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_MULTILINE.
    let config = &mut self.config;
    config.multi_line = yes;
    self
}
/// Enable matching of CRLF as a line terminator.
///
/// When enabled, anchors such as `^` and `$` will match any of the
/// following as a line terminator: `\r`, `\n` or `\r\n`.
///
/// This is disabled by default, in which case, only `\n` is recognized as
/// a line terminator.
pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` selects PCRE2_NEWLINE_ANYCRLF on the
    // compile context when this is set.
    let config = &mut self.config;
    config.crlf = yes;
    self
}
/// Enable Unicode matching mode.
///
/// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
/// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
///
/// When set, this implies UTF matching mode. It is not possible to enable
/// Unicode matching mode without enabling UTF matching mode.
///
/// This is disabled by default.
pub fn ucp(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_UCP plus PCRE2_UTF.
    let config = &mut self.config;
    config.ucp = yes;
    self
}
/// Enable UTF matching mode.
///
/// When enabled, characters are treated as sequences of code units that
/// make up a single codepoint instead of as single bytes. For example,
/// this will cause `.` to match any single UTF-8 encoded codepoint, whereas
/// when this is disabled, `.` will match any single byte (except for `\n` in
/// both cases, unless "dot all" mode is enabled).
///
/// Note that when UTF matching mode is enabled, every search performed
/// will do a UTF-8 validation check, which can impact performance. The
/// UTF-8 check can be disabled via the `disable_utf_check` option, but it
/// is undefined behavior to enable UTF matching mode and search invalid
/// UTF-8.
///
/// This is disabled by default.
pub fn utf(&mut self, yes: bool) -> &mut RegexBuilder {
    // Only recorded here; `build` turns it into PCRE2_UTF.
    let config = &mut self.config;
    config.utf = yes;
    self
}
/// When UTF matching mode is enabled, this will disable the UTF checking
/// that PCRE2 will normally perform automatically. If UTF matching mode
/// is not enabled, then this has no effect.
///
/// UTF checking is enabled by default when UTF matching mode is enabled.
/// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2
/// will return an error if you attempt to search a subject string that is
/// not valid UTF-8.
///
/// # Safety
///
/// It is undefined behavior to disable the UTF check in UTF matching mode
/// and search a subject string that is not valid UTF-8. When the UTF check
/// is disabled, callers must guarantee that the subject string is valid
/// UTF-8.
pub unsafe fn disable_utf_check(&mut self) -> &mut RegexBuilder {
    // With this cleared, searches pass PCRE2_NO_UTF_CHECK to PCRE2.
    let config = &mut self.config;
    config.utf_check = false;
    self
}
/// Enable PCRE2's JIT and return an error if it's not available.
///
/// This generally speeds up matching quite a bit. The downside is that it
/// can increase the time it takes to compile a pattern.
///
/// If the JIT isn't available or if JIT compilation returns an error, then
/// regex compilation will fail with the corresponding error.
///
/// This is disabled by default, and always overrides `jit_if_available`.
pub fn jit(&mut self, yes: bool) -> &mut RegexBuilder {
    // `true` demands the JIT (failure is an error at build time), `false`
    // switches the JIT off entirely.
    self.config.jit =
        if yes { JITChoice::Always } else { JITChoice::Never };
    self
}
/// Enable PCRE2's JIT if it's available.
///
/// This generally speeds up matching quite a bit. The downside is that it
/// can increase the time it takes to compile a pattern.
///
/// If the JIT isn't available or if JIT compilation returns an error,
/// then a debug message with the error will be emitted and the regex will
/// otherwise silently fall back to non-JIT matching.
///
/// This is disabled by default, and always overrides `jit`.
pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexBuilder {
    // `true` makes the JIT best-effort (failure is only logged at build
    // time), `false` switches the JIT off entirely.
    self.config.jit =
        if yes { JITChoice::Attempt } else { JITChoice::Never };
    self
}
/// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
/// not enabled, then this has no effect.
///
/// When `None` is given, no custom JIT stack will be created, and instead,
/// the default JIT stack is used. When the default is used, its maximum
/// size is 32 KB.
///
/// When this is set, then a new JIT stack will be created with the given
/// maximum size as its limit.
///
/// Increasing the stack size can be useful for larger regular expressions.
///
/// By default, this is set to `None`.
pub fn max_jit_stack_size(
    &mut self,
    bytes: Option<usize>,
) -> &mut RegexBuilder {
    // This is a match-time setting, so it lives in the match configuration.
    let match_config = &mut self.config.match_config;
    match_config.max_jit_stack_size = bytes;
    self
}
}
/// A compiled PCRE2 regular expression.
///
/// This regex is safe to use from multiple threads simultaneously. For top
/// performance, it is better to clone a new regex for each thread.
///
/// Cloning shares the compiled pattern and the capture name tables; only the
/// per-thread match data storage starts out fresh in the clone.
pub struct Regex {
/// The configuration used to build the regex.
config: Arc<Config>,
/// The original pattern string.
pattern: String,
/// The underlying compiled PCRE2 object.
code: Arc<Code>,
/// The capture group names for this regex.
capture_names: Arc<Vec<Option<String>>>,
/// A map from capture group name to capture group index.
capture_names_idx: Arc<HashMap<String, usize>>,
/// Mutable scratch data used by PCRE2 during matching.
///
/// We use the same strategy as Rust's regex crate here, such that each
/// thread gets its own match data to support using a Regex object from
/// multiple threads simultaneously. If some match data doesn't exist for
/// a thread, then a new one is created on demand.
match_data: CachedThreadLocal<RefCell<MatchData>>,
}
impl Clone for Regex {
    /// Produces a regex that shares the compiled pattern and name tables
    /// with `self` but starts with its own, empty, per-thread match data.
    fn clone(&self) -> Regex {
        let Regex {
            config,
            pattern,
            code,
            capture_names,
            capture_names_idx,
            match_data: _,
        } = self;
        Regex {
            config: Arc::clone(config),
            pattern: pattern.clone(),
            code: Arc::clone(code),
            capture_names: Arc::clone(capture_names),
            capture_names_idx: Arc::clone(capture_names_idx),
            // Scratch space is never shared between regex values.
            match_data: CachedThreadLocal::new(),
        }
    }
}
impl fmt::Debug for Regex {
    /// Shows the regex as its original pattern wrapped in `Regex(...)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let pattern = &self.pattern;
        write!(f, "Regex({:?})", pattern)
    }
}
impl Regex {
/// Compiles a regular expression using the default configuration.
///
/// Once compiled, it can be used repeatedly to search, split or replace
/// text in a string.
///
/// If an invalid expression is given, then an error is returned.
///
/// To configure compilation options for the regex, use the
/// [`RegexBuilder`](struct.RegexBuilder.html).
pub fn new(pattern: &str) -> Result<Regex, Error> {
    let builder = RegexBuilder::new();
    builder.build(pattern)
}
/// Returns true if and only if the regex matches the subject string given.
///
/// # Example
///
/// Test if some text contains at least one word with exactly 13 ASCII word
/// bytes:
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use pcre2::bytes::Regex;
///
/// let text = b"I categorically deny having triskaidekaphobia.";
/// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?);
/// # Ok(()) }; example().unwrap()
/// ```
pub fn is_match(&self, subject: &[u8]) -> Result<bool, Error> {
    // A plain search is a positioned search that starts at the beginning.
    let start = 0;
    self.is_match_at(subject, start)
}
/// Returns the start and end byte range of the leftmost-first match in
/// `subject`. If no match exists, then `None` is returned.
///
/// # Example
///
/// Find the start and end location of the first word with exactly 13
/// ASCII word bytes:
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use pcre2::bytes::Regex;
///
/// let text = b"I categorically deny having triskaidekaphobia.";
/// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap();
/// assert_eq!((mat.start(), mat.end()), (2, 15));
/// # Ok(()) }; example().unwrap()
/// ```
pub fn find<'s>(
    &self,
    subject: &'s [u8],
) -> Result<Option<Match<'s>>, Error> {
    // A plain search is a positioned search that starts at the beginning.
    let start = 0;
    self.find_at(subject, start)
}
/// Returns an iterator for each successive non-overlapping match in
/// `subject`, returning the start and end byte indices with respect to
/// `subject`.
///
/// # Example
///
/// Find the start and end location of every word with exactly 13 ASCII
/// word bytes:
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use pcre2::bytes::Regex;
///
/// let text = b"Retroactively relinquishing remunerations is reprehensible.";
/// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) {
/// let mat = result?;
/// println!("{:?}", mat);
/// }
/// # Ok(()) }; example().unwrap()
/// ```
pub fn find_iter<'r, 's>(&'r self, subject: &'s [u8]) -> Matches<'r, 's> {
    // Look up this thread's scratch space once, so that every step of the
    // iterator can reuse it.
    let match_data = self.match_data();
    Matches { re: self, match_data, subject, last_end: 0, last_match: None }
}
/// Returns the capture groups corresponding to the leftmost-first
/// match in `subject`. Capture group `0` always corresponds to the entire
/// match. If no match is found, then `None` is returned.
///
/// # Examples
///
/// Say you have some text with movie names and their release years,
/// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
/// looking like that, while also extracting the movie name and its release
/// year separately.
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use pcre2::bytes::Regex;
///
/// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?;
/// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
/// let caps = re.captures(text)?.unwrap();
/// assert_eq!(&caps[1], &b"Citizen Kane"[..]);
/// assert_eq!(&caps[2], &b"1941"[..]);
/// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
/// // You can also access the groups by index using the Index notation.
/// // Note that this will panic on an invalid index.
/// assert_eq!(&caps[1], b"Citizen Kane");
/// assert_eq!(&caps[2], b"1941");
/// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
/// # Ok(()) }; example().unwrap()
/// ```
///
/// Note that the full match is at capture group `0`. Each subsequent
/// capture group is indexed by the order of its opening `(`.
///
/// We can make this example a bit clearer by using *named* capture groups:
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use pcre2::bytes::Regex;
///
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?;
/// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
/// let caps = re.captures(text)?.unwrap();
/// assert_eq!(&caps["title"], &b"Citizen Kane"[..]);
/// assert_eq!(&caps["year"], &b"1941"[..]);
/// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
/// // You can also access the groups by name using the Index notation.
/// // Note that this will panic on an invalid group name.
/// assert_eq!(&caps["title"], b"Citizen Kane");
/// assert_eq!(&caps["year"], b"1941");
/// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
/// # Ok(()) }; example().unwrap()
/// ```
///
/// Here we name the capture groups, which we can access with the `name`
/// method or the `Index` notation with a `&str`. Note that the named
/// capture groups are still accessible with `get` or the `Index` notation
/// with a `usize`.
///
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'s>(
    &self,
    subject: &'s [u8],
) -> Result<Option<Captures<'s>>, Error> {
    let mut locs = self.capture_locations();
    // The overall match itself is not needed here: group 0 of the capture
    // locations carries the same information.
    if self.captures_read(&mut locs, subject)?.is_none() {
        return Ok(None);
    }
    let idx = Arc::clone(&self.capture_names_idx);
    Ok(Some(Captures { subject, locs, idx }))
}
/// Returns an iterator over all the non-overlapping capture groups matched
/// in `subject`. This is operationally the same as `find_iter`, except it
/// yields information about capturing group matches.
///
/// # Example
///
/// We can use this to find all movie titles and their release years in
/// some text, where the movie is formatted like "'Title' (xxxx)":
///
/// ```rust
/// # fn example() -> Result<(), ::pcre2::Error> {
/// use std::str;
///
/// use pcre2::bytes::Regex;
///
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?;
/// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
/// for result in re.captures_iter(text) {
/// let caps = result?;
/// let title = str::from_utf8(&caps["title"]).unwrap();
/// let year = str::from_utf8(&caps["year"]).unwrap();
/// println!("Movie: {:?}, Released: {:?}", title, year);
/// }
/// // Output:
/// // Movie: Citizen Kane, Released: 1941
/// // Movie: The Wizard of Oz, Released: 1939
/// // Movie: M, Released: 1931
/// # Ok(()) }; example().unwrap()
/// ```
pub fn captures_iter<'r, 's>(
    &'r self,
    subject: &'s [u8],
) -> CaptureMatches<'r, 's> {
    // Searching starts at the beginning and nothing has matched yet.
    CaptureMatches { re: self, subject, last_end: 0, last_match: None }
}
}
/// Advanced or "lower level" search methods.
impl Regex {
/// Returns the same as is_match, but starts the search at the given
/// offset.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
pub fn is_match_at(
    &self,
    subject: &[u8],
    start: usize,
) -> Result<bool, Error> {
    assert!(
        start <= subject.len(),
        "start ({}) must be <= subject.len() ({})",
        start,
        subject.len()
    );

    let options =
        if self.config.utf_check { 0 } else { PCRE2_NO_UTF_CHECK };

    let cell = self.match_data();
    let mut scratch = cell.borrow_mut();
    // SAFETY: The only unsafe PCRE2 option we potentially use here is
    // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the
    // `disable_utf_check` method, which propagates the safety contract to
    // the caller.
    let matched =
        unsafe { scratch.find(&self.code, subject, start, options)? };
    Ok(matched)
}
/// Returns the same as find, but starts the search at the given
/// offset.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
pub fn find_at<'s>(
    &self,
    subject: &'s [u8],
    start: usize,
) -> Result<Option<Match<'s>>, Error> {
    // Use the calling thread's scratch space for this one-off search.
    let scratch = self.match_data();
    self.find_at_with_match_data(scratch, subject, start)
}
/// Like find_at, but accepts match data instead of acquiring one itself.
///
/// This is useful for implementing the iterator, which permits avoiding
/// the synchronization overhead of acquiring the match data.
///
/// The match returned refers to the *entire* subject together with the
/// offsets of the match, which is what `Match::as_bytes` expects.
///
/// # Panics
///
/// Panics if `start` is greater than `subject.len()`.
#[inline(always)]
fn find_at_with_match_data<'s>(
    &self,
    match_data: &RefCell<MatchData>,
    subject: &'s [u8],
    start: usize,
) -> Result<Option<Match<'s>>, Error> {
    assert!(
        start <= subject.len(),
        "start ({}) must be <= subject.len() ({})",
        start,
        subject.len()
    );

    let mut options = 0;
    if !self.config.utf_check {
        options |= PCRE2_NO_UTF_CHECK;
    }

    let mut match_data = match_data.borrow_mut();
    // SAFETY: The only unsafe PCRE2 option we potentially use here is
    // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the
    // `disable_utf_check` method, which propagates the safety contract to
    // the caller.
    if unsafe { !match_data.find(&self.code, subject, start, options)? } {
        return Ok(None);
    }
    let ovector = match_data.ovector();
    let (s, e) = (ovector[0], ovector[1]);
    // Hand over the whole subject, not `&subject[s..e]`: `Match::as_bytes`
    // applies the `s..e` range itself, so pre-slicing would make it slice
    // twice and panic (or return the wrong bytes) whenever `s > 0`.
    Ok(Some(Match::new(subject, s, e)))
}
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
/// [`Captures`](struct.Captures.html) in order to amortize allocations.
///
/// To create a `CaptureLocations` value, use the
/// `Regex::capture_locations` method.
///
/// This returns the overall match if this was successful, which is always
/// equivalent to the `0`th capture group.
pub fn captures_read<'s>(
    &self,
    locs: &mut CaptureLocations,
    subject: &'s [u8],
) -> Result<Option<Match<'s>>, Error> {
    // A plain search is a positioned search that starts at the beginning.
    let start = 0;
    self.captures_read_at(locs, subject, start)
}
/// Returns the same as `captures_read`, but starts the search at the given
/// offset and populates the capture locations given.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
///
/// The match returned refers to the *entire* subject together with the
/// offsets of the match, which is what `Match::as_bytes` expects.
///
/// # Panics
///
/// Panics if `start` is greater than `subject.len()`.
pub fn captures_read_at<'s>(
    &self,
    locs: &mut CaptureLocations,
    subject: &'s [u8],
    start: usize,
) -> Result<Option<Match<'s>>, Error> {
    assert!(
        start <= subject.len(),
        "start ({}) must be <= subject.len() ({})",
        start,
        subject.len()
    );

    let mut options = 0;
    if !self.config.utf_check {
        options |= PCRE2_NO_UTF_CHECK;
    }

    // SAFETY: The only unsafe PCRE2 option we potentially use here is
    // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the
    // `disable_utf_check` method, which propagates the safety contract to
    // the caller.
    if unsafe { !locs.data.find(&self.code, subject, start, options)? } {
        return Ok(None);
    }
    let ovector = locs.data.ovector();
    let (s, e) = (ovector[0], ovector[1]);
    // Hand over the whole subject, not `&subject[s..e]`: `Match::as_bytes`
    // applies the `s..e` range itself, so pre-slicing would make it slice
    // twice and panic (or return the wrong bytes) whenever `s > 0`.
    Ok(Some(Match::new(subject, s, e)))
}
}
/// Auxiliary methods.
impl Regex {
/// Returns the original pattern string for this regex.
pub fn as_str(&self) -> &str {
    self.pattern.as_str()
}
/// Returns a sequence of all capturing groups and their names, if present.
///
/// The length of the slice returned is always equal to the result of
/// `captures_len`, which is the number of capturing groups (including the
/// capturing group for the entire pattern).
///
/// Each entry in the slice is the name of the corresponding capturing
/// group, if one exists. The first capturing group (at index `0`) is
/// always unnamed.
///
/// Capturing groups are indexed by the order of the opening parenthesis.
pub fn capture_names(&self) -> &[Option<String>] {
    self.capture_names.as_slice()
}
/// Returns the number of capturing groups in the pattern.
///
/// This is always 1 more than the number of syntactic groups in the
/// pattern, since the first group always corresponds to the entire match.
pub fn captures_len(&self) -> usize {
    let count = self.code.capture_count();
    count.expect("a valid capture count from PCRE2")
}
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
    // The locations own their own match data and keep the compiled code
    // alive alongside it.
    let code = Arc::clone(&self.code);
    let data = self.new_match_data();
    CaptureLocations { code, data }
}
fn match_data(&self) -> &RefCell<MatchData> {
    // Fetch the calling thread's scratch space, creating it on first use.
    self.match_data.get_or(|| RefCell::new(self.new_match_data()))
}
fn new_match_data(&self) -> MatchData {
    let match_config = self.config.match_config.clone();
    MatchData::new(match_config, &self.code)
}
}
/// CaptureLocations is a low level representation of the raw offsets of each
/// submatch.
///
/// Primarily, this type is useful when using `Regex` APIs such as
/// `captures_read`, which permits amortizing the allocation in which capture
/// match locations are stored.
///
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
pub struct CaptureLocations {
/// The compiled regex these locations were created for.
code: Arc<Code>,
/// The match data whose ovector holds the raw capture offsets.
data: MatchData,
}
impl Clone for CaptureLocations {
    /// Creates new capture locations for the same compiled regex, backed by
    /// freshly created match data that uses the same match configuration.
    fn clone(&self) -> CaptureLocations {
        let match_config = self.data.config().clone();
        let data = MatchData::new(match_config, &self.code);
        CaptureLocations { code: Arc::clone(&self.code), data }
    }
}
impl fmt::Debug for CaptureLocations {
    /// Shows every raw offset in the ovector, with unset offsets shown as
    /// `None`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let offsets: Vec<Option<usize>> = self
            .data
            .ovector()
            .iter()
            .map(|&offset| {
                if offset == PCRE2_UNSET { None } else { Some(offset) }
            })
            .collect();
        write!(f, "CaptureLocations(")?;
        f.debug_list().entries(offsets).finish()?;
        write!(f, ")")
    }
}
impl CaptureLocations {
    /// Returns the start and end positions of the Nth capture group.
    ///
    /// This returns `None` if `i` is not a valid capture group or if the
    /// capture group did not match anything.
    ///
    /// The positions returned are always byte indices with respect to the
    /// original subject string matched.
    #[inline]
    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
        let ovec = self.data.ovector();
        // Group `i` occupies slots `2i` and `2i + 1`. Compute them with
        // checked arithmetic: an absurdly large `i` must be reported as an
        // invalid group instead of overflowing (which would panic in debug
        // builds and wrap around to an unrelated group in release builds).
        let start_slot = i.checked_mul(2)?;
        let end_slot = start_slot.checked_add(1)?;
        let (s, e) = (*ovec.get(start_slot)?, *ovec.get(end_slot)?);
        if s == PCRE2_UNSET || e == PCRE2_UNSET {
            // The group exists but did not participate in the match.
            return None;
        }
        Some((s, e))
    }

    /// Returns the total number of capturing groups.
    ///
    /// This is always at least `1` since every regex has at least `1`
    /// capturing group that corresponds to the entire match.
    #[inline]
    pub fn len(&self) -> usize {
        // Every group takes up two slots (start and end) in the ovector.
        self.data.ovector().len() / 2
    }
}
/// Captures represents a group of captured byte strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. If a capture
/// group is named, then the matched byte string is *also* available via the
/// `name` method. (Note that the 0th capture is always unnamed and so must be
/// accessed with the `get` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'s` is the lifetime of the matched subject string.
pub struct Captures<'s> {
/// The subject string that was searched.
subject: &'s [u8],
/// The offsets of every capture group for this match.
locs: CaptureLocations,
/// A map from capture group name to capture group index.
idx: Arc<HashMap<String, usize>>,
}
impl<'s> Captures<'s> {
    /// Looks up the capture group at index `i`.
    ///
    /// `None` is returned when `i` does not correspond to a capture group,
    /// or when the capture group did not participate in the match.
    ///
    /// # Examples
    ///
    /// Get the text of the match with a default of an empty string if this
    /// group didn't participate in the match:
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use pcre2::bytes::Regex;
    ///
    /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?;
    /// let caps = re.captures(b"abc123")?.unwrap();
    ///
    /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
    /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
    /// assert_eq!(text1, &b"123"[..]);
    /// assert_eq!(text2, &b""[..]);
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn get(&self, i: usize) -> Option<Match<'s>> {
        let (start, end) = self.locs.get(i)?;
        Some(Match::new(self.subject, start, end))
    }

    /// Looks up the capture group named `name`.
    ///
    /// `None` is returned when `name` isn't a valid capture group or when
    /// the group didn't match anything.
    pub fn name(&self, name: &str) -> Option<Match<'s>> {
        let &i = self.idx.get(name)?;
        self.get(i)
    }

    /// Returns the number of captured groups.
    ///
    /// This is always at least `1`, since every regex has at least one
    /// capture group that corresponds to the full match.
    #[inline]
    pub fn len(&self) -> usize {
        let locs = &self.locs;
        locs.len()
    }
}
impl<'s> fmt::Debug for Captures<'s> {
    /// Shows the captures as a `Captures(..)` tuple wrapping a map from
    /// group name (or index) to the escaped text of the group.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let groups = CapturesDebug(self);
        let mut tuple = f.debug_tuple("Captures");
        tuple.field(&groups);
        tuple.finish()
    }
}
/// Helper that renders the groups of a `Captures` value as a debug map.
struct CapturesDebug<'c, 's: 'c>(&'c Captures<'s>);

impl<'c, 's> fmt::Debug for CapturesDebug<'c, 's> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Renders arbitrary bytes as printable ASCII by escaping each byte.
        fn escape_bytes(bytes: &[u8]) -> String {
            bytes
                .iter()
                .flat_map(|&byte| std::ascii::escape_default(byte))
                .map(char::from)
                .collect()
        }

        let caps = self.0;
        // We'd like to show something nice here, even if it means an
        // allocation to build a reverse index.
        let slot_to_name: HashMap<usize, &str> = caps
            .idx
            .iter()
            .map(|(name, &slot)| (slot, name.as_str()))
            .collect();

        let mut map = f.debug_map();
        for slot in 0..caps.len() {
            let text = caps
                .locs
                .get(slot)
                .map(|(s, e)| escape_bytes(&caps.subject[s..e]));
            // Named groups are keyed by name, all others by index.
            match slot_to_name.get(&slot) {
                Some(name) => {
                    map.entry(name, &text);
                }
                None => {
                    map.entry(&slot, &text);
                }
            }
        }
        map.finish()
    }
}
/// Get a group by index.
///
/// `'s` is the lifetime of the matched subject string.
///
/// The bytes returned through `Index` can't outlive the `Captures` object,
/// because of how `Index` is defined (normally `a[i]` is part of `a` and
/// can't outlive it); to get bytes that live as long as the subject, use
/// `get()` instead.
///
/// # Panics
///
/// If there is no group at the given index.
impl<'s> Index<usize> for Captures<'s> {
    type Output = [u8];

    fn index(&self, i: usize) -> &[u8] {
        match self.get(i) {
            Some(m) => m.as_bytes(),
            None => panic!("no group at index '{}'", i),
        }
    }
}
/// Get a group by name.
///
/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime
/// of the group name (the index).
///
/// The bytes returned through `Index` can't outlive the `Captures` object,
/// because of how `Index` is defined (normally `a[i]` is part of `a` and
/// can't outlive it); to get bytes that live as long as the subject, use
/// `name` instead.
///
/// # Panics
///
/// If there is no group named by the given value.
impl<'s, 'i> Index<&'i str> for Captures<'s> {
    type Output = [u8];

    fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
        match self.name(name) {
            Some(m) => m.as_bytes(),
            None => panic!("no group named '{}'", name),
        }
    }
}
/// An iterator over all non-overlapping matches for a particular subject
/// string.
///
/// The iterator yields matches (if no error occurred while searching)
/// corresponding to the start and end of the match. The indices are byte
/// offsets. The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct Matches<'r, 's> {
/// The regex being searched with.
re: &'r Regex,
/// The match data reused for every search this iterator runs.
match_data: &'r RefCell<MatchData>,
/// The subject string being searched.
subject: &'s [u8],
/// The offset at which the next search starts.
last_end: usize,
/// The end offset of the most recently yielded match, if any.
last_match: Option<usize>,
}
impl<'r, 's> Iterator for Matches<'r, 's> {
    type Item = Result<Match<'s>, Error>;

    /// Advances to the next non-overlapping match.
    ///
    /// Returns `None` once the subject is exhausted. If a search fails, the
    /// error is yielded exactly once and the iterator ends afterwards,
    /// since retrying the same search could only fail in the same way.
    fn next(&mut self) -> Option<Result<Match<'s>, Error>> {
        // In UTF mode PCRE2 only accepts starting offsets that fall on a
        // character boundary. UCP mode implies UTF mode when building.
        let utf_mode = self.re.config.utf || self.re.config.ucp;
        loop {
            if self.last_end > self.subject.len() {
                return None;
            }
            let res = self.re.find_at_with_match_data(
                self.match_data,
                self.subject,
                self.last_end,
            );
            let m = match res {
                Err(err) => {
                    // Move past the end of the subject so that the next
                    // call reports exhaustion instead of repeating the
                    // failing search forever.
                    self.last_end = self.subject.len() + 1;
                    return Some(Err(err));
                }
                Ok(None) => return None,
                Ok(Some(m)) => m,
            };
            if m.start() == m.end() {
                // This is an empty match. To ensure we make progress, start
                // the next search at the smallest possible starting position
                // of the next match following this one. In UTF mode that is
                // the start of the next character, so skip over any UTF-8
                // continuation bytes (`0b10xx_xxxx`).
                let mut next_start = m.end() + 1;
                if utf_mode {
                    while self
                        .subject
                        .get(next_start)
                        .map_or(false, |&b| (b & 0xC0) == 0x80)
                    {
                        next_start += 1;
                    }
                }
                self.last_end = next_start;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(m.end()) == self.last_match {
                    continue;
                }
            } else {
                self.last_end = m.end();
            }
            self.last_match = Some(m.end());
            return Some(Ok(m));
        }
    }
}
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct CaptureMatches<'r, 's> {
/// The regex being searched with.
re: &'r Regex,
/// The subject string being searched.
subject: &'s [u8],
/// The offset at which the next search starts.
last_end: usize,
/// The end offset of the most recently yielded match, if any.
last_match: Option<usize>,
}
impl<'r, 's> Iterator for CaptureMatches<'r, 's> {
    type Item = Result<Captures<'s>, Error>;

    /// Advances to the capture groups of the next non-overlapping match.
    ///
    /// Returns `None` once the subject is exhausted. If a search fails, the
    /// error is yielded exactly once and the iterator ends afterwards,
    /// since retrying the same search could only fail in the same way.
    fn next(&mut self) -> Option<Result<Captures<'s>, Error>> {
        // In UTF mode PCRE2 only accepts starting offsets that fall on a
        // character boundary. UCP mode implies UTF mode when building.
        let utf_mode = self.re.config.utf || self.re.config.ucp;
        loop {
            if self.last_end > self.subject.len() {
                return None;
            }
            // Each yielded `Captures` owns its locations, so fresh ones are
            // needed for every search.
            let mut locs = self.re.capture_locations();
            let res = self.re.captures_read_at(
                &mut locs,
                self.subject,
                self.last_end,
            );
            let m = match res {
                Err(err) => {
                    // Move past the end of the subject so that the next
                    // call reports exhaustion instead of repeating the
                    // failing search forever.
                    self.last_end = self.subject.len() + 1;
                    return Some(Err(err));
                }
                Ok(None) => return None,
                Ok(Some(m)) => m,
            };
            if m.start() == m.end() {
                // This is an empty match. To ensure we make progress, start
                // the next search at the smallest possible starting position
                // of the next match following this one. In UTF mode that is
                // the start of the next character, so skip over any UTF-8
                // continuation bytes (`0b10xx_xxxx`).
                let mut next_start = m.end() + 1;
                if utf_mode {
                    while self
                        .subject
                        .get(next_start)
                        .map_or(false, |&b| (b & 0xC0) == 0x80)
                    {
                        next_start += 1;
                    }
                }
                self.last_end = next_start;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(m.end()) == self.last_match {
                    continue;
                }
            } else {
                self.last_end = m.end();
            }
            self.last_match = Some(m.end());
            return Some(Ok(Captures {
                subject: self.subject,
                locs,
                idx: Arc::clone(&self.re.capture_names_idx),
            }));
        }
    }
}
#[cfg(test)]
mod tests {
    use super::{Regex, RegexBuilder};
    use crate::is_jit_available;

    // Shorthand for viewing a string literal as bytes. Needed for subjects
    // with non-ASCII characters, which `b"..."` literals cannot contain.
    fn b(string: &str) -> &[u8] {
        string.as_bytes()
    }

    // Collects the (start, end) offsets of every match yielded by
    // `find_iter`, panicking if any search reports an error.
    fn find_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> {
        re.find_iter(subject)
            .map(|result| result.unwrap().as_pair())
            .collect()
    }

    // Collects the (start, end) offsets of capture group 0 for every match
    // yielded by `captures_iter`, panicking if any search reports an error.
    fn cap_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> {
        re.captures_iter(subject)
            .map(|result| result.unwrap().get(0).unwrap().as_pair())
            .collect()
    }

    // Caseless matching works for ASCII, and for non-ASCII letters once
    // Unicode properties are enabled.
    #[test]
    fn caseless() {
        let re = RegexBuilder::new()
            .caseless(true)
            .build("a")
            .unwrap();
        assert!(re.is_match(b("A")).unwrap());

        let re = RegexBuilder::new()
            .caseless(true)
            .ucp(true)
            .build("β")
            .unwrap();
        assert!(re.is_match(b("Β")).unwrap());
    }

    // With CRLF handling enabled, `$` matches before a trailing `\r\n`.
    #[test]
    fn crlf() {
        let re = RegexBuilder::new()
            .crlf(true)
            .build("a$")
            .unwrap();
        let m = re.find(b("a\r\n")).unwrap().unwrap();
        assert_eq!(m.as_pair(), (0, 1));
    }

    // `.` matches a line feed only when dotall is enabled.
    #[test]
    fn dotall() {
        let re = RegexBuilder::new()
            .dotall(false)
            .build(".")
            .unwrap();
        assert!(!re.is_match(b("\n")).unwrap());

        let re = RegexBuilder::new()
            .dotall(true)
            .build(".")
            .unwrap();
        assert!(re.is_match(b("\n")).unwrap());
    }

    // In extended mode, whitespace in the pattern is not significant.
    #[test]
    fn extended() {
        let re = RegexBuilder::new()
            .extended(true)
            .build("a b c")
            .unwrap();
        assert!(re.is_match(b("abc")).unwrap());
    }

    // `^` and `$` match at interior line boundaries only in multi-line mode.
    #[test]
    fn multi_line() {
        let re = RegexBuilder::new()
            .multi_line(false)
            .build("^abc$")
            .unwrap();
        assert!(!re.is_match(b("foo\nabc\nbar")).unwrap());

        let re = RegexBuilder::new()
            .multi_line(true)
            .build("^abc$")
            .unwrap();
        assert!(re.is_match(b("foo\nabc\nbar")).unwrap());
    }

    // `\w` matches a non-ASCII letter only when Unicode properties are on.
    #[test]
    fn ucp() {
        let re = RegexBuilder::new()
            .ucp(false)
            .build(r"\w")
            .unwrap();
        assert!(!re.is_match(b("β")).unwrap());

        let re = RegexBuilder::new()
            .ucp(true)
            .build(r"\w")
            .unwrap();
        assert!(re.is_match(b("β")).unwrap());
    }

    // Without UTF mode `.` consumes a single byte of the two-byte "β"; with
    // UTF mode it consumes the whole character.
    #[test]
    fn utf() {
        let re = RegexBuilder::new()
            .utf(false)
            .build(".")
            .unwrap();
        assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 1));

        let re = RegexBuilder::new()
            .utf(true)
            .build(".")
            .unwrap();
        assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 2));
    }

    // Requiring the JIT must either work (when the JIT is available) or make
    // the build fail (when it is not).
    #[test]
    fn jit4lyfe() {
        if is_jit_available() {
            let re = RegexBuilder::new()
                .jit(true)
                .build(r"\w")
                .unwrap();
            assert!(re.is_match(b("a")).unwrap());
        } else {
            // Check that if JIT isn't enabled, then we get an error if we
            // require JIT.
            RegexBuilder::new()
                .jit(true)
                .build(r"\w")
                .unwrap_err();
        }
    }

    // Unlike jit4lyfe, this tests that everything works when requesting the
    // JIT only if it's available. In jit4lyfe, we require the JIT or fail.
    // If the JIT isn't available, then in this test, we simply don't use it.
    #[test]
    fn jit_if_available() {
        let re = RegexBuilder::new()
            .jit_if_available(true)
            .build(r"\w")
            .unwrap();
        assert!(re.is_match(b("a")).unwrap());
    }

    // This tests for a regression that caused a segfault in the pcre2
    // library when searching an empty, unallocated subject:
    // https://github.com/BurntSushi/rust-pcre2/issues/10
    #[test]
    fn jit_test_lazy_alloc_subject() {
        let subject: Vec<u8> = vec![];
        let re = RegexBuilder::new()
            .jit_if_available(true)
            .build(r"xxxx|xxxx|xxxx")
            .unwrap();
        assert!(!re.is_match(&subject).unwrap());
    }

    // Invalid UTF-8 is matched byte-wise when UTF mode is off, and reported
    // as an error when UTF mode is on.
    #[test]
    fn utf_with_invalid_data() {
        let re = RegexBuilder::new()
            .build(r".")
            .unwrap();
        assert_eq!(re.find(b"\xFF").unwrap().unwrap().as_pair(), (0, 1));

        let re = RegexBuilder::new()
            .utf(true)
            .build(r".")
            .unwrap();
        assert!(re.find(b"\xFF").is_err());
    }

    // Capture names are reported per group index, with `None` for unnamed
    // groups (including the implicit group 0).
    #[test]
    fn capture_names() {
        let re = RegexBuilder::new()
            .build(
                r"(?P<foo>abc)|(def)|(?P<a>ghi)|(?P<springsteen>jkl)"
            )
            .unwrap();
        assert_eq!(re.capture_names().to_vec(), vec![
            None,
            Some("foo".to_string()),
            None,
            Some("a".to_string()),
            Some("springsteen".to_string()),
        ]);

        // Test our internal map as well.
        assert_eq!(re.capture_names_idx.len(), 3);
        assert_eq!(re.capture_names_idx["foo"], 1);
        assert_eq!(re.capture_names_idx["a"], 3);
        assert_eq!(re.capture_names_idx["springsteen"], 4);
    }

    // A group that did not participate in the match yields `None` from
    // `Captures::get`.
    #[test]
    fn captures_get() {
        let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
        let caps = re.captures(b"abc123").unwrap().unwrap();

        let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
        let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
        assert_eq!(text1, &b"123"[..]);
        assert_eq!(text2, &b""[..]);
    }

    // Iterating over empty matches must terminate and report each
    // start-of-line position exactly once.
    #[test]
    fn find_iter_empty() {
        let re = Regex::new(r"(?m:^)").unwrap();
        assert_eq!(find_iter_tuples(&re, b""), vec![(0, 0)]);
        assert_eq!(find_iter_tuples(&re, b"\n"), vec![(0, 0)]);
        assert_eq!(find_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(find_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(find_iter_tuples(&re, b"\na\n\n"), vec![
            (0, 0), (1, 1), (3, 3),
        ]);
    }

    // Same as find_iter_empty, but through the captures iterator.
    #[test]
    fn captures_iter_empty() {
        let re = Regex::new(r"(?m:^)").unwrap();
        assert_eq!(cap_iter_tuples(&re, b""), vec![(0, 0)]);
        assert_eq!(cap_iter_tuples(&re, b"\n"), vec![(0, 0)]);
        assert_eq!(cap_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(cap_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]);
        assert_eq!(cap_iter_tuples(&re, b"\na\n\n"), vec![
            (0, 0), (1, 1), (3, 3),
        ]);
    }

    // A tiny JIT stack limit should make a heavy search fail, and a larger
    // limit should let the same search succeed.
    #[test]
    fn max_jit_stack_size_does_something() {
        if !is_jit_available() {
            return;
        }

        // The same pattern is compiled twice, with different stack limits.
        let pattern = r"((((\w{10})){100}))+";
        let hundred = "\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
            ABCDEFGHIJKLMNOPQRSTUVWXY\
        ";
        let hay = hundred.repeat(100);

        // First, try a regex that checks that we can blow the JIT stack limit.
        let re = RegexBuilder::new()
            .ucp(true)
            .jit(true)
            .max_jit_stack_size(Some(1))
            .build(pattern)
            .unwrap();
        let err = match re.is_match(hay.as_bytes()) {
            // Skip this test, since for some reason we weren't able to blow
            // the stack limit.
            Ok(_) => return,
            Err(err) => err,
        };
        assert!(err.to_string().contains("JIT stack limit reached"));

        // Now bump up the JIT stack limit and check that it succeeds.
        let re = RegexBuilder::new()
            .ucp(true)
            .jit(true)
            .max_jit_stack_size(Some(1 << 20))
            .build(pattern)
            .unwrap();
        assert!(re.is_match(hay.as_bytes()).unwrap());
    }
}