| use std::ffi::OsStr; |
| use std::str; |
| |
| use bstr::{ByteSlice, ByteVec}; |
| |
/// Where the scanner in `unescape` currently stands relative to an escape
/// sequence.
#[derive(Copy, Clone, PartialEq, Eq)]
enum State {
    /// Not inside any escape sequence.
    Literal,
    /// A `\` has just been consumed.
    Escape,
    /// A `\x` has just been consumed; no hex digit has followed yet.
    HexFirst,
    /// A `\x[0-9A-Fa-f]` has just been consumed. The digit is carried along
    /// so it can be combined with the next one, or re-emitted verbatim.
    HexSecond(char),
}
| |
| /// Escapes arbitrary bytes into a human readable string. |
| /// |
| /// This converts `\t`, `\r` and `\n` into their escaped forms. It also |
| /// converts the non-printable subset of ASCII in addition to invalid UTF-8 |
| /// bytes to hexadecimal escape sequences. Everything else is left as is. |
| /// |
| /// The dual of this routine is [`unescape`](fn.unescape.html). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to convert a byte string that contains a `\n` and |
| /// invalid UTF-8 bytes into a `String`. |
| /// |
| /// Pay special attention to the use of raw strings. That is, `r"\n"` is |
| /// equivalent to `"\\n"`. |
| /// |
| /// ``` |
| /// use grep_cli::escape; |
| /// |
| /// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz")); |
| /// ``` |
| pub fn escape(bytes: &[u8]) -> String { |
| let mut escaped = String::new(); |
| for (s, e, ch) in bytes.char_indices() { |
| if ch == '\u{FFFD}' { |
| for b in bytes[s..e].bytes() { |
| escape_byte(b, &mut escaped); |
| } |
| } else { |
| escape_char(ch, &mut escaped); |
| } |
| } |
| escaped |
| } |
| |
| /// Escapes an OS string into a human readable string. |
| /// |
| /// This is like [`escape`](fn.escape.html), but accepts an OS string. |
| pub fn escape_os(string: &OsStr) -> String { |
| escape(Vec::from_os_str_lossy(string).as_bytes()) |
| } |
| |
| /// Unescapes a string. |
| /// |
| /// It supports a limited set of escape sequences: |
| /// |
| /// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes. |
| /// * `\xZZ` hexadecimal escapes are mapped to their byte. |
| /// |
| /// Everything else is left as is, including non-hexadecimal escapes like |
| /// `\xGG`. |
| /// |
| /// This is useful when it is desirable for a command line argument to be |
| /// capable of specifying arbitrary bytes or otherwise make it easier to |
| /// specify non-printable characters. |
| /// |
| /// The dual of this routine is [`escape`](fn.escape.html). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to convert an escaped string (which is valid UTF-8) |
| /// into a corresponding sequence of bytes. Each escape sequence is mapped to |
| /// its bytes, which may include invalid UTF-8. |
| /// |
| /// Pay special attention to the use of raw strings. That is, `r"\n"` is |
| /// equivalent to `"\\n"`. |
| /// |
| /// ``` |
| /// use grep_cli::unescape; |
| /// |
| /// assert_eq!(&b"foo\nbar\xFFbaz"[..], &*unescape(r"foo\nbar\xFFbaz")); |
| /// ``` |
| pub fn unescape(s: &str) -> Vec<u8> { |
| use self::State::*; |
| |
| let mut bytes = vec![]; |
| let mut state = Literal; |
| for c in s.chars() { |
| match state { |
| Escape => match c { |
| '\\' => { |
| bytes.push(b'\\'); |
| state = Literal; |
| } |
| 'n' => { |
| bytes.push(b'\n'); |
| state = Literal; |
| } |
| 'r' => { |
| bytes.push(b'\r'); |
| state = Literal; |
| } |
| 't' => { |
| bytes.push(b'\t'); |
| state = Literal; |
| } |
| 'x' => { |
| state = HexFirst; |
| } |
| c => { |
| bytes.extend(format!(r"\{}", c).into_bytes()); |
| state = Literal; |
| } |
| }, |
| HexFirst => match c { |
| '0'..='9' | 'A'..='F' | 'a'..='f' => { |
| state = HexSecond(c); |
| } |
| c => { |
| bytes.extend(format!(r"\x{}", c).into_bytes()); |
| state = Literal; |
| } |
| }, |
| HexSecond(first) => match c { |
| '0'..='9' | 'A'..='F' | 'a'..='f' => { |
| let ordinal = format!("{}{}", first, c); |
| let byte = u8::from_str_radix(&ordinal, 16).unwrap(); |
| bytes.push(byte); |
| state = Literal; |
| } |
| c => { |
| let original = format!(r"\x{}{}", first, c); |
| bytes.extend(original.into_bytes()); |
| state = Literal; |
| } |
| }, |
| Literal => match c { |
| '\\' => { |
| state = Escape; |
| } |
| c => { |
| bytes.extend(c.to_string().as_bytes()); |
| } |
| }, |
| } |
| } |
| match state { |
| Escape => bytes.push(b'\\'), |
| HexFirst => bytes.extend(b"\\x"), |
| HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()), |
| Literal => {} |
| } |
| bytes |
| } |
| |
| /// Unescapes an OS string. |
| /// |
| /// This is like [`unescape`](fn.unescape.html), but accepts an OS string. |
| /// |
| /// Note that this first lossily decodes the given OS string as UTF-8. That |
| /// is, an escaped string (the thing given) should be valid UTF-8. |
| pub fn unescape_os(string: &OsStr) -> Vec<u8> { |
| unescape(&string.to_string_lossy()) |
| } |
| |
| /// Adds the given codepoint to the given string, escaping it if necessary. |
| fn escape_char(cp: char, into: &mut String) { |
| if cp.is_ascii() { |
| escape_byte(cp as u8, into); |
| } else { |
| into.push(cp); |
| } |
| } |
| |
/// Adds the given byte to the given string, escaping it if necessary.
///
/// * `\n`, `\r`, `\t` and `\` are written as `\n`, `\r`, `\t` and `\\`.
/// * Every other visible ASCII byte (`!` through `~`) is written as is.
/// * Anything else is written as an uppercase `\xNN` hexadecimal escape.
fn escape_byte(byte: u8, into: &mut String) {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    match byte {
        b'\n' => into.push_str(r"\n"),
        b'\r' => into.push_str(r"\r"),
        b'\t' => into.push_str(r"\t"),
        b'\\' => into.push_str(r"\\"),
        // Visible ASCII. The backslash (0x5C) was already handled above. The
        // upper bound is `~` (0x7E); stopping at `}` would hex-escape a
        // perfectly printable character.
        // NOTE(review): the space (0x20) is still hex-escaped, as before;
        // possibly so that whitespace stays visible. Confirm this is
        // intended.
        0x21..=0x7E => into.push(char::from(byte)),
        _ => {
            // Two table lookups instead of a temporary `format!` string.
            into.push_str(r"\x");
            into.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            into.push(char::from(HEX_UPPER[usize::from(byte & 0x0F)]));
        }
    }
}
| |
#[cfg(test)]
mod tests {
    use super::{escape, unescape};

    /// Checks that `unescape(escaped)` produces exactly the bytes `raw`.
    fn unescapes_to(escaped: &str, raw: &[u8]) {
        assert_eq!(raw, &unescape(escaped)[..], "unescape({:?})", escaped);
    }

    /// Checks that `escape(raw)` produces exactly the string `escaped`.
    fn escapes_to(raw: &[u8], escaped: &str) {
        assert_eq!(escaped, escape(raw), "escape({:?})", raw);
    }

    #[test]
    fn empty() {
        unescapes_to(r"", b"");
        escapes_to(b"", r"");
    }

    #[test]
    fn backslash() {
        unescapes_to(r"\\", b"\\");
        escapes_to(b"\\", r"\\");
    }

    #[test]
    fn nul() {
        unescapes_to(r"\x00", b"\x00");
        escapes_to(b"\x00", r"\x00");
    }

    #[test]
    fn nl() {
        unescapes_to(r"\n", b"\n");
        escapes_to(b"\n", r"\n");
    }

    #[test]
    fn tab() {
        unescapes_to(r"\t", b"\t");
        escapes_to(b"\t", r"\t");
    }

    #[test]
    fn carriage() {
        unescapes_to(r"\r", b"\r");
        escapes_to(b"\r", r"\r");
    }

    // An unknown escape such as `\a` is passed through untouched.
    #[test]
    fn nothing_simple() {
        unescapes_to(r"\a", b"\\a");
        unescapes_to(r"\\a", b"\\a");
        escapes_to(b"\\a", r"\\a");
    }

    // A `\x` with no hex digits after it is passed through untouched.
    #[test]
    fn nothing_hex0() {
        unescapes_to(r"\x", b"\\x");
        unescapes_to(r"\\x", b"\\x");
        escapes_to(b"\\x", r"\\x");
    }

    // A `\x` followed by one non-hex character is passed through untouched.
    #[test]
    fn nothing_hex1() {
        unescapes_to(r"\xz", b"\\xz");
        unescapes_to(r"\\xz", b"\\xz");
        escapes_to(b"\\xz", r"\\xz");
    }

    // A `\x` followed by two non-hex characters is passed through untouched.
    #[test]
    fn nothing_hex2() {
        unescapes_to(r"\xzz", b"\\xzz");
        unescapes_to(r"\\xzz", b"\\xzz");
        escapes_to(b"\\xzz", r"\\xzz");
    }

    // Bytes that are not valid UTF-8 come out as hexadecimal escapes.
    #[test]
    fn invalid_utf8() {
        escapes_to(b"\xFF", r"\xFF");
        escapes_to(b"a\xFFb", r"a\xFFb");
    }
}