| /*! |
| Provides routines for interpolating capture group references. |
| |
| That is, if a replacement string contains references like `$foo` or `${foo1}`, |
| then they are replaced with the corresponding capture values for the groups |
| named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` |
| is supported as well, with `1` corresponding to a capture group index and not |
| a name. |
| |
| This module provides the free functions [`string`] and [`bytes`], which |
| interpolate Rust Unicode strings and byte strings, respectively. |
| |
| # Format |
| |
| These routines support two different kinds of capture references: unbraced and |
| braced. |
| |
| For the unbraced format, the format supported is `$ref` where `name` can be |
| any character in the class `[0-9A-Za-z_]`. `ref` is always the longest |
| possible parse. So for example, `$1a` corresponds to the capture group named |
| `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then |
| it is treated as a capture group index itself and not a name. |
| |
| For the braced format, the format supported is `${ref}` where `ref` can be any |
| sequence of bytes except for `}`. If no closing brace occurs, then it is not |
| considered a capture reference. As with the unbraced format, if `ref` matches |
| `^[0-9]+$`, then it is treated as a capture group index and not a name. |
| |
| The braced format is useful for exerting precise control over the name of the |
| capture reference. For example, `${1}a` corresponds to the capture group |
| reference `1` followed by the letter `a`, where as `$1a` (as mentioned above) |
| corresponds to the capture group reference `1a`. The braced format is also |
| useful for expressing capture group names that use characters not supported by |
| the unbraced format. For example, `${foo[bar].baz}` refers to the capture group |
| named `foo[bar].baz`. |
| |
| If a capture group reference is found and it does not refer to a valid capture |
| group, then it will be replaced with the empty string. |
| |
| To write a literal `$`, use `$$`. |
| |
| To be clear, and as exhibited via the type signatures in the routines in this |
| module, it is impossible for a replacement string to be invalid. A replacement |
| string may not have the intended semantics, but the interpolation procedure |
| itself can never fail. |
| */ |
| |
| use alloc::string::String; |
| |
| /// Accepts a replacement string and interpolates capture references with their |
| /// corresponding values. |
| /// |
| /// `append` should be a function that appends the string value of a capture |
| /// group at a particular index to the string given. If the capture group |
| /// index is invalid, then nothing should be appended. |
| /// |
| /// `name_to_index` should be a function that maps a capture group name to a |
| /// capture group index. If the given name doesn't exist, then `None` should |
| /// be returned. |
| /// |
| /// Finally, `dst` is where the final interpolated contents should be written. |
| /// If `replacement` contains no capture group references, then `dst` will be |
| /// equivalent to `replacement`. |
| /// |
| /// See the [module documentation](self) for details about the format |
| /// supported. |
| pub fn string( |
| mut replacement: &str, |
| mut append: impl FnMut(usize, &mut String), |
| mut name_to_index: impl FnMut(&str) -> Option<usize>, |
| dst: &mut String, |
| ) { |
| while !replacement.is_empty() { |
| match replacement.find('$') { |
| None => break, |
| Some(i) => { |
| dst.push_str(&replacement[..i]); |
| replacement = &replacement[i..]; |
| } |
| } |
| // Handle escaping of '$'. |
| if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { |
| dst.push_str("$"); |
| replacement = &replacement[2..]; |
| continue; |
| } |
| debug_assert!(!replacement.is_empty()); |
| let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
| Some(cap_ref) => cap_ref, |
| None => { |
| dst.push_str("$"); |
| replacement = &replacement[1..]; |
| continue; |
| } |
| }; |
| replacement = &replacement[cap_ref.end..]; |
| match cap_ref.cap { |
| Ref::Number(i) => append(i, dst), |
| Ref::Named(name) => { |
| if let Some(i) = name_to_index(name) { |
| append(i, dst); |
| } |
| } |
| } |
| } |
| dst.push_str(replacement); |
| } |
| |
| /* |
| This should be uncommented and used if we ever provide public APIs for |
| searching `&[u8]`. |
| |
| /// Accepts a replacement byte string and interpolates capture references with |
| /// their corresponding values. |
| /// |
| /// `append` should be a function that appends the byte string value of a |
| /// capture group at a particular index to the byte string given. If the |
| /// capture group index is invalid, then nothing should be appended. |
| /// |
| /// `name_to_index` should be a function that maps a capture group name to a |
| /// capture group index. If the given name doesn't exist, then `None` should |
| /// be returned. |
| /// |
| /// Finally, `dst` is where the final interpolated contents should be written. |
| /// If `replacement` contains no capture group references, then `dst` will be |
| /// equivalent to `replacement`. |
| /// |
| /// See the [module documentation](self) for details about the format |
| /// supported. |
| pub fn bytes( |
| mut replacement: &[u8], |
| mut append: impl FnMut(usize, &mut Vec<u8>), |
| mut name_to_index: impl FnMut(&str) -> Option<usize>, |
| dst: &mut Vec<u8>, |
| ) { |
| while !replacement.is_empty() { |
| match replacement.iter().position(|&b| b == b'$') { |
| None => break, |
| Some(i) => { |
| dst.extend_from_slice(&replacement[..i]); |
| replacement = &replacement[i..]; |
| } |
| } |
| // Handle escaping of '$'. |
| if replacement.get(1).map_or(false, |&b| b == b'$') { |
| dst.push(b'$'); |
| replacement = &replacement[2..]; |
| continue; |
| } |
| debug_assert!(!replacement.is_empty()); |
| let cap_ref = match find_cap_ref(replacement) { |
| Some(cap_ref) => cap_ref, |
| None => { |
| dst.push(b'$'); |
| replacement = &replacement[1..]; |
| continue; |
| } |
| }; |
| replacement = &replacement[cap_ref.end..]; |
| match cap_ref.cap { |
| Ref::Number(i) => append(i, dst), |
| Ref::Named(name) => { |
| if let Some(i) = name_to_index(name) { |
| append(i, dst); |
| } |
| } |
| } |
| } |
| dst.extend_from_slice(replacement); |
| } |
| */ |
| |
| /// `CaptureRef` represents a reference to a capture group inside some text. |
| /// The reference is either a capture group name or a number. |
| /// |
| /// It is also tagged with the position in the text following the |
| /// capture reference. |
| #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| struct CaptureRef<'a> { |
| cap: Ref<'a>, |
| end: usize, |
| } |
| |
| /// A reference to a capture group in some text. |
| /// |
| /// e.g., `$2`, `$foo`, `${foo}`. |
| #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| enum Ref<'a> { |
| Named(&'a str), |
| Number(usize), |
| } |
| |
| impl<'a> From<&'a str> for Ref<'a> { |
| fn from(x: &'a str) -> Ref<'a> { |
| Ref::Named(x) |
| } |
| } |
| |
| impl From<usize> for Ref<'static> { |
| fn from(x: usize) -> Ref<'static> { |
| Ref::Number(x) |
| } |
| } |
| |
| /// Parses a possible reference to a capture group name in the given text, |
| /// starting at the beginning of `replacement`. |
| /// |
| /// If no such valid reference could be found, None is returned. |
| /// |
| /// Note that this returns a "possible" reference because this routine doesn't |
| /// know whether the reference is to a valid group or not. If it winds up not |
| /// being a valid reference, then it should be replaced with the empty string. |
| fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
| let mut i = 0; |
| let rep: &[u8] = replacement; |
| if rep.len() <= 1 || rep[0] != b'$' { |
| return None; |
| } |
| i += 1; |
| if rep[i] == b'{' { |
| return find_cap_ref_braced(rep, i + 1); |
| } |
| let mut cap_end = i; |
| while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
| cap_end += 1; |
| } |
| if cap_end == i { |
| return None; |
| } |
| // We just verified that the range 0..cap_end is valid ASCII, so it must |
| // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
| // check via an unchecked conversion or by parsing the number straight from |
| // &[u8]. |
| let cap = core::str::from_utf8(&rep[i..cap_end]) |
| .expect("valid UTF-8 capture name"); |
| Some(CaptureRef { |
| cap: match cap.parse::<usize>() { |
| Ok(i) => Ref::Number(i), |
| Err(_) => Ref::Named(cap), |
| }, |
| end: cap_end, |
| }) |
| } |
| |
| /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening |
| /// brace has been found at `i-1` in `rep`. This then looks for a closing |
| /// brace and returns the capture reference within the brace. |
| fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
| assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); |
| let start = i; |
| while rep.get(i).map_or(false, |&b| b != b'}') { |
| i += 1; |
| } |
| if !rep.get(i).map_or(false, |&b| b == b'}') { |
| return None; |
| } |
| // When looking at braced names, we don't put any restrictions on the name, |
| // so it's possible it could be invalid UTF-8. But a capture group name |
| // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
| // safely return None. |
| let cap = match core::str::from_utf8(&rep[start..i]) { |
| Err(_) => return None, |
| Ok(cap) => cap, |
| }; |
| Some(CaptureRef { |
| cap: match cap.parse::<usize>() { |
| Ok(i) => Ref::Number(i), |
| Err(_) => Ref::Named(cap), |
| }, |
| end: i + 1, |
| }) |
| } |
| |
| /// Returns true if and only if the given byte is allowed in a capture name |
| /// written in non-brace form. |
| fn is_valid_cap_letter(b: u8) -> bool { |
| match b { |
| b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, |
| _ => false, |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use alloc::{string::String, vec, vec::Vec}; |
| |
| use super::{find_cap_ref, CaptureRef}; |
| |
| macro_rules! find { |
| ($name:ident, $text:expr) => { |
| #[test] |
| fn $name() { |
| assert_eq!(None, find_cap_ref($text.as_bytes())); |
| } |
| }; |
| ($name:ident, $text:expr, $capref:expr) => { |
| #[test] |
| fn $name() { |
| assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); |
| } |
| }; |
| } |
| |
| macro_rules! c { |
| ($name_or_number:expr, $pos:expr) => { |
| CaptureRef { cap: $name_or_number.into(), end: $pos } |
| }; |
| } |
| |
| find!(find_cap_ref1, "$foo", c!("foo", 4)); |
| find!(find_cap_ref2, "${foo}", c!("foo", 6)); |
| find!(find_cap_ref3, "$0", c!(0, 2)); |
| find!(find_cap_ref4, "$5", c!(5, 2)); |
| find!(find_cap_ref5, "$10", c!(10, 3)); |
| // See https://github.com/rust-lang/regex/pull/585 |
| // for more on characters following numbers |
| find!(find_cap_ref6, "$42a", c!("42a", 4)); |
| find!(find_cap_ref7, "${42}a", c!(42, 5)); |
| find!(find_cap_ref8, "${42"); |
| find!(find_cap_ref9, "${42 "); |
| find!(find_cap_ref10, " $0 "); |
| find!(find_cap_ref11, "$"); |
| find!(find_cap_ref12, " "); |
| find!(find_cap_ref13, ""); |
| find!(find_cap_ref14, "$1-$2", c!(1, 2)); |
| find!(find_cap_ref15, "$1_$2", c!("1_", 3)); |
| find!(find_cap_ref16, "$x-$y", c!("x", 2)); |
| find!(find_cap_ref17, "$x_$y", c!("x_", 3)); |
| find!(find_cap_ref18, "${#}", c!("#", 4)); |
| find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); |
| find!(find_cap_ref20, "${¾}", c!("¾", 5)); |
| find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); |
| find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); |
| find!(find_cap_ref23, "${☃}", c!("☃", 6)); |
| find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); |
| find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); |
| find!(find_cap_ref26, "${名字}", c!("名字", 9)); |
| |
| fn interpolate_string( |
| mut name_to_index: Vec<(&'static str, usize)>, |
| caps: Vec<&'static str>, |
| replacement: &str, |
| ) -> String { |
| name_to_index.sort_by_key(|x| x.0); |
| |
| let mut dst = String::new(); |
| super::string( |
| replacement, |
| |i, dst| { |
| if let Some(&s) = caps.get(i) { |
| dst.push_str(s); |
| } |
| }, |
| |name| -> Option<usize> { |
| name_to_index |
| .binary_search_by_key(&name, |x| x.0) |
| .ok() |
| .map(|i| name_to_index[i].1) |
| }, |
| &mut dst, |
| ); |
| dst |
| } |
| |
| /* |
| fn interpolate_bytes( |
| mut name_to_index: Vec<(&'static str, usize)>, |
| caps: Vec<&'static str>, |
| replacement: &str, |
| ) -> String { |
| name_to_index.sort_by_key(|x| x.0); |
| |
| let mut dst = vec![]; |
| super::bytes( |
| replacement.as_bytes(), |
| |i, dst| { |
| if let Some(&s) = caps.get(i) { |
| dst.extend_from_slice(s.as_bytes()); |
| } |
| }, |
| |name| -> Option<usize> { |
| name_to_index |
| .binary_search_by_key(&name, |x| x.0) |
| .ok() |
| .map(|i| name_to_index[i].1) |
| }, |
| &mut dst, |
| ); |
| String::from_utf8(dst).unwrap() |
| } |
| */ |
| |
| macro_rules! interp { |
| ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { |
| #[test] |
| fn $name() { |
| assert_eq!( |
| $expected, |
| interpolate_string($map, $caps, $hay), |
| "interpolate::string failed", |
| ); |
| /* |
| assert_eq!( |
| $expected, |
| interpolate_bytes($map, $caps, $hay), |
| "interpolate::bytes failed", |
| ); |
| */ |
| } |
| }; |
| } |
| |
| interp!( |
| interp1, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test $foo test", |
| "test xxx test", |
| ); |
| |
| interp!( |
| interp2, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test$footest", |
| "test", |
| ); |
| |
| interp!( |
| interp3, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test${foo}test", |
| "testxxxtest", |
| ); |
| |
| interp!( |
| interp4, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test$2test", |
| "test", |
| ); |
| |
| interp!( |
| interp5, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test${2}test", |
| "testxxxtest", |
| ); |
| |
| interp!( |
| interp6, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test $$foo test", |
| "test $foo test", |
| ); |
| |
| interp!( |
| interp7, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "test $foo", |
| "test xxx", |
| ); |
| |
| interp!( |
| interp8, |
| vec![("foo", 2)], |
| vec!["", "", "xxx"], |
| "$foo test", |
| "xxx test", |
| ); |
| |
| interp!( |
| interp9, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test $bar$foo", |
| "test yyyxxx", |
| ); |
| |
| interp!( |
| interp10, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test $ test", |
| "test $ test", |
| ); |
| |
| interp!( |
| interp11, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test ${} test", |
| "test test", |
| ); |
| |
| interp!( |
| interp12, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test ${ } test", |
| "test test", |
| ); |
| |
| interp!( |
| interp13, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test ${a b} test", |
| "test test", |
| ); |
| |
| interp!( |
| interp14, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test ${a} test", |
| "test test", |
| ); |
| |
| // This is a funny case where a braced reference is never closed, but |
| // within the unclosed braced reference, there is an unbraced reference. |
| // In this case, the braced reference is just treated literally and the |
| // unbraced reference is found. |
| interp!( |
| interp15, |
| vec![("bar", 1), ("foo", 2)], |
| vec!["", "yyy", "xxx"], |
| "test ${wat $bar ok", |
| "test ${wat yyy ok", |
| ); |
| } |