blob: 52ae532ed6794d22ff8b32184105e27afbf8804c [file] [log] [blame] [edit]
use super::{Regex, Region, SearchOptions};
use std::iter::FusedIterator;
impl Regex {
/// Searches `text` for the leftmost-first match and returns its capture
/// groups.
///
/// Group `0` is always the whole match. `None` is returned when the
/// expression does not match anywhere in `text`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
    let mut region = Region::new();
    // Bail out with `None` as soon as the search reports no match.
    let offset = self.search_with_options(
        text,
        0,
        text.len(),
        SearchOptions::SEARCH_OPTION_NONE,
        Some(&mut region),
    )?;
    Some(Captures {
        text,
        region,
        offset,
    })
}
/// Creates an iterator over every successive non-overlapping match in
/// `text`. Each item is the `(start, end)` pair of byte indices of one
/// match, relative to `text`.
///
/// # Example
///
/// Find the start and end location of every word with exactly 13
/// characters:
///
/// ```rust
/// # use onig::Regex;
/// # fn main() {
/// let text = "Retroactively relinquishing remunerations is reprehensible.";
/// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
///     println!("{:?}", pos);
/// }
/// // Output:
/// // (0, 13)
/// // (14, 27)
/// // (28, 41)
/// // (45, 58)
/// # }
/// ```
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
    // The iterator starts at the beginning of `text` with no match seen yet.
    let region = Region::new();
    FindMatches {
        text,
        regex: self,
        region,
        last_match_end: None,
        last_end: 0,
    }
}
/// Creates an iterator over the capture groups of every non-overlapping
/// match in `text`. It walks the text the same way `find_iter` does, but
/// each item is a full `Captures` value, so submatches are available too.
///
/// # Example
///
/// We can use this to find all movie titles and their release years in
/// some text, where the movie is formatted like "'Title' (xxxx)":
///
/// ```rust
/// # use onig::Regex;
/// # fn main() {
/// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")
///                .unwrap();
/// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
/// for caps in re.captures_iter(text) {
///     println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2));
/// }
/// // Output (`at` returns an `Option`, so `{:?}` shows the `Some` wrapper):
/// // Movie: Some("Citizen Kane"), Released: Some("1941")
/// // Movie: Some("The Wizard of Oz"), Released: Some("1939")
/// // Movie: Some("M"), Released: Some("1931")
/// # }
/// ```
pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> {
    // Start at offset 0 with no previous match recorded.
    FindCaptures {
        text,
        regex: self,
        last_match_end: None,
        last_end: 0,
    }
}
/// Splits `text` around every match of the regular expression.
///
/// The items are the pieces of `text` that lie *between* matches, i.e. the
/// parts the expression did not match. Every piece borrows from `text`;
/// nothing is copied.
///
/// # Example
///
/// To split a string delimited by arbitrary amounts of spaces or tabs:
///
/// ```rust
/// # use onig::Regex;
/// # fn main() {
/// let re = Regex::new(r"[ \t]+").unwrap();
/// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
/// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
/// # }
/// ```
pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
    // The splitter is driven by the match iterator; the first piece starts
    // at byte 0.
    let finder = self.find_iter(text);
    RegexSplits { finder, last: 0 }
}
/// Splits `text` around matches of the regular expression, producing at
/// most `limit` substrings. A `limit` of `0` produces no substrings.
///
/// As with `split`, the items are the parts of `text` that the expression
/// did *not* match. Once the limit is reached, whatever is left of the
/// string is returned unsplit as the final item.
///
/// Every piece borrows from `text`; nothing is copied.
///
/// # Example
///
/// Get the first two words in some text:
///
/// ```rust
/// # use onig::Regex;
/// # fn main() {
/// let re = Regex::new(r"\W+").unwrap();
/// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
/// assert_eq!(fields, vec!("Hey", "How", "are you?"));
/// # }
/// ```
pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> {
    // Wrap the unbounded splitter and let the wrapper count items down.
    let splits = self.split(text);
    RegexSplitsN { splits, n: limit }
}
/// Scan the given slice, capturing into the given region and
/// executing a callback for each match.
///
/// `callback` receives the two integers that Oniguruma hands to the scan
/// callback, plus a `Region` cloned from the raw region of the current
/// match. The sibling `scan` method treats the second integer as the byte
/// offset of the match; the first is presumably the running match index
/// (confirm against the Oniguruma `onig_scan` documentation).
///
/// A `true` result from `callback` is reported to Oniguruma as `0` and a
/// `false` result as `-1`; presumably a non-zero value stops the scan
/// (TODO confirm).
///
/// The return value is whatever `onig_scan` returns, passed through
/// unchanged.
pub fn scan_with_region<F>(
&self,
to_search: &str,
region: &mut Region,
options: SearchOptions,
mut callback: F,
) -> i32
where
F: Fn(i32, i32, &Region) -> bool,
{
use onig_sys::{onig_scan, OnigRegion};
use std::os::raw::{c_int, c_void};
// Find the bounds of the string we're searching
let start = to_search.as_ptr();
// Slicing at `len()` yields an empty slice whose pointer is one past the
// last byte of `to_search`.
let end = to_search[to_search.len()..].as_ptr();
// C-ABI trampoline: recovers the Rust closure from the user-data pointer
// and forwards the call to it.
unsafe extern "C" fn scan_cb<F>(
i: c_int,
j: c_int,
r: *mut OnigRegion,
ud: *mut c_void,
) -> c_int
where
F: Fn(i32, i32, &Region) -> bool,
{
// The callback gets its own copy of the region rather than the raw one.
let region = Region::clone_from_raw(r);
// NOTE(review): relies on `ud` being the `callback` pointer passed to
// `onig_scan` below, and on that closure still being alive.
let callback = &*(ud as *mut F);
if callback(i, j, &region) {
0
} else {
-1
}
}
// NOTE(review): `start`/`end` are derived from `to_search`, and `callback`
// is a local that lives until this function returns; soundness assumes
// `onig_scan` does not keep any of these pointers after it returns.
unsafe {
onig_scan(
self.raw,
start,
end,
(&mut region.raw) as *mut ::onig_sys::OnigRegion,
options.bits(),
Some(scan_cb::<F>),
&mut callback as *mut F as *mut c_void,
)
}
}
/// Scan a pattern and observe its captures.
///
/// Runs `scan_with_region` over the haystack `to_search` with default
/// search options and a scratch region, calling `callback` for each
/// reported match. The callback receives the first integer from the scan
/// unchanged, together with a `Captures` value built from a clone of the
/// match's region; the second integer from the scan becomes the captures'
/// offset.
///
/// The callback's boolean result is handed straight back to
/// `scan_with_region`. The integer result of the scan itself is discarded.
pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB)
where
    CB: Fn(i32, Captures<'t>) -> bool,
{
    let mut scratch = Region::new();
    let options = SearchOptions::SEARCH_OPTION_NONE;
    self.scan_with_region(to_search, &mut scratch, options, |index, start, matched| {
        callback(
            index,
            Captures {
                text: to_search,
                region: matched.clone(),
                offset: start as usize,
            },
        )
    });
}
}
/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. Positions
/// returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
#[derive(Debug)]
pub struct Captures<'t> {
/// The text that was searched; capture positions index into it.
text: &'t str,
/// Positions of every capture group for this match.
region: Region,
/// Position reported by the search that produced this match; exposed
/// through `offset()`.
offset: usize,
}
impl<'t> Captures<'t> {
    /// Looks up the start and end byte positions of capture group `pos`.
    ///
    /// Yields `None` when `pos` is not a valid capture group or when that
    /// group did not take part in the match. Positions are byte indices
    /// into the original string that was matched.
    pub fn pos(&self, pos: usize) -> Option<(usize, usize)> {
        self.region.pos(pos)
    }

    /// Looks up the text matched by capture group `pos`.
    ///
    /// Yields `None` when `pos` is not a valid capture group or when that
    /// group did not match anything.
    pub fn at(&self, pos: usize) -> Option<&'t str> {
        let (start, stop) = self.pos(pos)?;
        Some(&self.text[start..stop])
    }

    /// The number of capture groups held by this match.
    pub fn len(&self) -> usize {
        self.region.len()
    }

    /// `true` exactly when this match holds no capture groups at all.
    pub fn is_empty(&self) -> bool {
        self.region.len() == 0
    }

    /// Iterates over the text of every capture group, in the order the
    /// groups appear in the regular expression.
    pub fn iter(&'t self) -> SubCaptures<'t> {
        SubCaptures { caps: self, idx: 0 }
    }

    /// Iterates over the positions of every capture group, in the order
    /// the groups appear in the regular expression. Positions are byte
    /// indices into the original string that was matched.
    pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
        SubCapturesPos { caps: self, idx: 0 }
    }

    /// The offset of these captures within the searched string slice.
    pub fn offset(&self) -> usize {
        self.offset
    }
}
/// An iterator over capture groups for a particular match of a regular
/// expression.
///
/// Each item is the text of one group, or `None` for a group that did not
/// match anything.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCaptures<'t> {
/// Index of the next capture group to yield.
idx: usize,
/// The captures being iterated.
caps: &'t Captures<'t>,
}
impl<'t> Iterator for SubCaptures<'t> {
    type Item = Option<&'t str>;

    /// Yields the text of the next capture group, or `Some(None)` for a
    /// group that did not match. Returns `None` once every group has been
    /// visited.
    fn next(&mut self) -> Option<Option<&'t str>> {
        if self.idx < self.caps.len() {
            let group = self.caps.at(self.idx);
            self.idx += 1;
            Some(group)
        } else {
            None
        }
    }

    /// Reports the exact number of groups that have not been yielded yet.
    ///
    /// This must shrink as the iterator advances: `ExactSizeIterator`
    /// requires the hint to be the precise remaining length, not the total
    /// number of groups.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.caps.len().saturating_sub(self.idx);
        (remaining, Some(remaining))
    }

    /// Counts the groups that have not been yielded yet, without walking
    /// them.
    fn count(self) -> usize {
        self.caps.len().saturating_sub(self.idx)
    }
}
impl<'t> FusedIterator for SubCaptures<'t> {}
impl<'t> ExactSizeIterator for SubCaptures<'t> {}
/// An iterator over capture group positions for a particular match of
/// a regular expression.
///
/// Each item is the `(start, end)` pair of one group, or `None` for a group
/// without a position. Positions are byte indices in terms of the original
/// string matched. `'t` is the lifetime of the matched text.
pub struct SubCapturesPos<'t> {
/// Index of the next capture group to yield.
idx: usize,
/// The captures being iterated.
caps: &'t Captures<'t>,
}
impl<'t> Iterator for SubCapturesPos<'t> {
    type Item = Option<(usize, usize)>;

    /// Yields the `(start, end)` byte positions of the next capture group,
    /// or `Some(None)` for a group without a position. Returns `None` once
    /// every group has been visited.
    fn next(&mut self) -> Option<Option<(usize, usize)>> {
        if self.idx < self.caps.len() {
            let span = self.caps.pos(self.idx);
            self.idx += 1;
            Some(span)
        } else {
            None
        }
    }

    /// Reports the exact number of groups that have not been yielded yet.
    ///
    /// This must shrink as the iterator advances: `ExactSizeIterator`
    /// requires the hint to be the precise remaining length, not the total
    /// number of groups.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.caps.len().saturating_sub(self.idx);
        (remaining, Some(remaining))
    }

    /// Counts the groups that have not been yielded yet, without walking
    /// them.
    fn count(self) -> usize {
        self.caps.len().saturating_sub(self.idx)
    }
}
impl<'t> FusedIterator for SubCapturesPos<'t> {}
impl<'t> ExactSizeIterator for SubCapturesPos<'t> {}
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a tuple of integers corresponding to the start and end
/// of the match. The indices are byte offsets. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
/// of the matched string.
pub struct FindMatches<'r, 't> {
/// The expression being searched for.
regex: &'r Regex,
/// Scratch region, cleared and refilled by every search.
region: Region,
/// The text being searched.
text: &'t str,
/// Byte offset at which the next search starts.
last_end: usize,
/// End offset of the most recently yielded match, if any; used to reject
/// an empty match directly after the previous match.
last_match_end: Option<usize>,
}
impl<'r, 't> Iterator for FindMatches<'r, 't> {
    type Item = (usize, usize);

    /// Finds the next match at or after the current search offset and
    /// yields its `(start, end)` byte positions, or `None` when the search
    /// finds nothing more.
    fn next(&mut self) -> Option<(usize, usize)> {
        loop {
            // Stepping over a trailing empty match can push the offset past
            // the end of the text; that means we are done.
            if self.last_end > self.text.len() {
                return None;
            }
            self.region.clear();
            self.regex.search_with_options(
                self.text,
                self.last_end,
                self.text.len(),
                SearchOptions::SEARCH_OPTION_NONE,
                Some(&mut self.region),
            )?;
            let (start, end) = self.region.pos(0).unwrap();
            // An empty match sitting exactly where the previous match ended
            // would be reported forever, so step over one character (or one
            // byte at the very end of the text) and search again.
            if start == end && self.last_match_end == Some(end) {
                let step = self.text[self.last_end..]
                    .chars()
                    .next()
                    .map_or(1, |c| c.len_utf8());
                self.last_end += step;
                continue;
            }
            self.last_end = end;
            self.last_match_end = Some(end);
            return Some((start, end));
        }
    }
}
impl<'r, 't> FusedIterator for FindMatches<'r, 't> {}
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
/// of the matched string.
pub struct FindCaptures<'r, 't> {
/// The expression being searched for.
regex: &'r Regex,
/// The text being searched.
text: &'t str,
/// Byte offset at which the next search starts.
last_end: usize,
/// End offset of the most recently yielded match, if any; used to reject
/// an empty match directly after the previous match.
last_match_end: Option<usize>,
}
impl<'r, 't> Iterator for FindCaptures<'r, 't> {
    type Item = Captures<'t>;

    /// Finds the next match at or after the current search offset and
    /// yields its captures, or `None` when the search finds nothing more.
    fn next(&mut self) -> Option<Captures<'t>> {
        loop {
            // Stepping over a trailing empty match can push the offset past
            // the end of the text; that means we are done.
            if self.last_end > self.text.len() {
                return None;
            }
            // Each attempt gets a fresh region because a yielded `Captures`
            // takes ownership of it.
            let mut region = Region::new();
            let offset = self.regex.search_with_options(
                self.text,
                self.last_end,
                self.text.len(),
                SearchOptions::SEARCH_OPTION_NONE,
                Some(&mut region),
            )?;
            let (start, end) = region.pos(0).unwrap();
            // An empty match sitting exactly where the previous match ended
            // would be reported forever, so step over one character (or one
            // byte at the very end of the text) and search again.
            if start == end && self.last_match_end == Some(end) {
                let step = self.text[self.last_end..]
                    .chars()
                    .next()
                    .map_or(1, |c| c.len_utf8());
                self.last_end += step;
                continue;
            }
            self.last_end = end;
            self.last_match_end = Some(end);
            return Some(Captures {
                text: self.text,
                region,
                offset,
            });
        }
    }
}
impl<'r, 't> FusedIterator for FindCaptures<'r, 't> {}
/// Yields all substrings delimited by a regular expression match.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplits<'r, 't> {
/// Match iterator that locates the delimiters; it also holds the text.
finder: FindMatches<'r, 't>,
/// Byte offset where the next substring begins (the end of the previous
/// delimiter match, or 0 at the start).
last: usize,
}
impl<'r, 't> Iterator for RegexSplits<'r, 't> {
    type Item = &'t str;

    /// Yields the text between the previous delimiter and the next one.
    /// After the last delimiter, the remaining text is yielded once,
    /// provided it is not empty.
    fn next(&mut self) -> Option<&'t str> {
        let haystack = self.finder.text;
        if let Some((start, end)) = self.finder.next() {
            // Everything from the end of the previous delimiter up to the
            // start of this one is the next piece.
            let piece = &haystack[self.last..start];
            self.last = end;
            return Some(piece);
        }
        // No delimiters left: hand out the tail unless it has already been
        // consumed or is empty.
        if self.last < haystack.len() {
            let tail = &haystack[self.last..];
            self.last = haystack.len();
            Some(tail)
        } else {
            None
        }
    }
}
impl<'r, 't> FusedIterator for RegexSplits<'r, 't> {}
/// Yields at most `N` substrings delimited by a regular expression match.
///
/// The last substring will be whatever remains after splitting.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplitsN<'r, 't> {
/// The underlying unbounded splitter.
splits: RegexSplits<'r, 't>,
/// How many more items may still be yielded; decremented on every call
/// to `next` that is not already at zero.
n: usize,
}
impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
    type Item = &'t str;

    /// Yields the next substring while the limit allows it.
    ///
    /// The item that uses up the limit is the whole unsplit remainder of
    /// the text (which may be empty). If the text runs out of pieces before
    /// the limit is reached, the iterator finishes and stays finished.
    fn next(&mut self) -> Option<&'t str> {
        if self.n == 0 {
            return None;
        }
        self.n -= 1;
        if self.n == 0 {
            // Limit reached: hand back everything that has not been split.
            let text = self.splits.finder.text;
            return Some(&text[self.splits.last..]);
        }
        let piece = self.splits.next();
        if piece.is_none() {
            // The inner splitter is exhausted. Drop the remaining budget so
            // that a later call cannot reach the "limit reached" branch and
            // yield a spurious empty remainder after `None`, which would
            // break the `FusedIterator` guarantee.
            self.n = 0;
        }
        piece
    }

    /// At most `n` more items can be produced; possibly fewer.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.n))
    }
}
impl<'r, 't> FusedIterator for RegexSplitsN<'r, 't> {}
#[cfg(test)]
mod tests {
    use super::super::*;

    // Positions and text of each group, including a group that did not match.
    #[test]
    fn test_regex_captures() {
        let re = Regex::new("e(l+)|(r+)").unwrap();
        let caps = re.captures("hello").unwrap();
        assert_eq!(caps.len(), 3);
        assert!(!caps.is_empty());

        let whole_span = caps.pos(0).unwrap();
        let group_span = caps.pos(1).unwrap();
        assert_eq!(whole_span, (1, 4));
        assert_eq!(group_span, (2, 4));
        assert_eq!(caps.pos(2), None);

        let whole_text = caps.at(0).unwrap();
        let group_text = caps.at(1).unwrap();
        assert_eq!(whole_text, "ell");
        assert_eq!(group_text, "ll");
        assert_eq!(caps.at(2), None);
    }

    // Iterating over the text of every group.
    #[test]
    fn test_regex_subcaptures() {
        let re = Regex::new("e(l+)").unwrap();
        let found = re.captures("hello").unwrap();
        let groups: Vec<_> = found.iter().collect();
        assert_eq!(groups.len(), 2);
        assert_eq!(groups[0], Some("ell"));
        assert_eq!(groups[1], Some("ll"));
    }

    // Iterating over the position of every group.
    #[test]
    fn test_regex_subcapturespos() {
        let re = Regex::new("e(l+)").unwrap();
        let found = re.captures("hello").unwrap();
        let spans: Vec<_> = found.iter_pos().collect();
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0], Some((1, 4)));
        assert_eq!(spans[1], Some((2, 4)));
    }

    #[test]
    fn test_find_iter() {
        let digits = Regex::new(r"\d+").unwrap();
        let spans: Vec<_> = digits.find_iter("a12b2").collect();
        assert_eq!(spans, vec![(1, 3), (4, 5)]);
    }

    #[test]
    fn test_find_iter_one_zero_length() {
        let digits = Regex::new(r"\d*").unwrap();
        let spans: Vec<_> = digits.find_iter("a1b2").collect();
        assert_eq!(spans, vec![(0, 0), (1, 2), (3, 4)]);
    }

    #[test]
    fn test_find_iter_many_zero_length() {
        let digits = Regex::new(r"\d*").unwrap();
        let spans: Vec<_> = digits.find_iter("a1bbb2").collect();
        assert_eq!(spans, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]);
    }

    // An empty match is still reported when it does not sit directly at the
    // end of the previous match.
    #[test]
    fn test_find_iter_empty_after_match() {
        let re = Regex::new(r"b|(?=,)").unwrap();
        let spans: Vec<_> = re.find_iter("ba,").collect();
        assert_eq!(spans, vec![(0, 1), (2, 2)]);
    }

    #[test]
    fn test_zero_length_matches_jumps_past_match_location() {
        let boundary = Regex::new(r"\b").unwrap();
        let spans: Vec<_> = boundary.find_iter("test string").collect();
        assert_eq!(spans, [(0, 0), (4, 4), (5, 5), (11, 11)]);
    }

    #[test]
    fn test_captures_iter() {
        let digits = Regex::new(r"\d+").unwrap();
        let all: Vec<_> = digits.captures_iter("a12b2").collect();
        assert_eq!(all[0].pos(0).unwrap(), (1, 3));
        assert_eq!(all[1].pos(0).unwrap(), (4, 5));
    }

    // `offset()` reports where each match was found, both for a single
    // search and for every item of `captures_iter`.
    #[test]
    fn test_captures_stores_match_offset() {
        let decimal = Regex::new(r"\d+\.(\d+)").unwrap();
        let first = decimal.captures("100 - 3.1415 / 2.0").unwrap();
        assert_eq!(6, first.offset());

        let offsets: Vec<_> = decimal
            .captures_iter("1 - 3234.3 * 123.2 - 100")
            .map(|caps| caps.offset())
            .collect();
        assert_eq!(vec![4, 13], offsets);
    }
}