Frederick Mayle | 7bad3ce | 2022-10-27 16:29:18 -0700 | [diff] [blame] | 1 | use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::hex_digit_value}; |
| 2 | |
| 3 | |
| 4 | /// Must start with `\` |
| 5 | pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> { |
| 6 | let first = input.as_bytes().get(1) |
| 7 | .ok_or(perr(offset, UnterminatedEscape))?; |
| 8 | let out = match first { |
| 9 | // Quote escapes |
| 10 | b'\'' => (E::from_byte(b'\''), 2), |
| 11 | b'"' => (E::from_byte(b'"'), 2), |
| 12 | |
| 13 | // Ascii escapes |
| 14 | b'n' => (E::from_byte(b'\n'), 2), |
| 15 | b'r' => (E::from_byte(b'\r'), 2), |
| 16 | b't' => (E::from_byte(b'\t'), 2), |
| 17 | b'\\' => (E::from_byte(b'\\'), 2), |
| 18 | b'0' => (E::from_byte(b'\0'), 2), |
| 19 | b'x' => { |
| 20 | let hex_string = input.get(2..4) |
| 21 | .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))? |
| 22 | .as_bytes(); |
| 23 | let first = hex_digit_value(hex_string[0]) |
| 24 | .ok_or(perr(offset..offset + 4, InvalidXEscape))?; |
| 25 | let second = hex_digit_value(hex_string[1]) |
| 26 | .ok_or(perr(offset..offset + 4, InvalidXEscape))?; |
| 27 | let value = second + 16 * first; |
| 28 | |
| 29 | if E::SUPPORTS_UNICODE && value > 0x7F { |
| 30 | return Err(perr(offset..offset + 4, NonAsciiXEscape)); |
| 31 | } |
| 32 | |
| 33 | (E::from_byte(value), 4) |
| 34 | }, |
| 35 | |
| 36 | // Unicode escape |
| 37 | b'u' => { |
| 38 | if !E::SUPPORTS_UNICODE { |
| 39 | return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral)); |
| 40 | } |
| 41 | |
| 42 | if input.as_bytes().get(2) != Some(&b'{') { |
| 43 | return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace)); |
| 44 | } |
| 45 | |
| 46 | let closing_pos = input.bytes().position(|b| b == b'}') |
| 47 | .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?; |
| 48 | |
| 49 | let inner = &input[3..closing_pos]; |
| 50 | if inner.as_bytes().first() == Some(&b'_') { |
| 51 | return Err(perr(4, InvalidStartOfUnicodeEscape)); |
| 52 | } |
| 53 | |
| 54 | let mut v: u32 = 0; |
| 55 | let mut digit_count = 0; |
| 56 | for (i, b) in inner.bytes().enumerate() { |
| 57 | if b == b'_'{ |
| 58 | continue; |
| 59 | } |
| 60 | |
| 61 | let digit = hex_digit_value(b) |
| 62 | .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?; |
| 63 | |
| 64 | if digit_count == 6 { |
| 65 | return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape)); |
| 66 | } |
| 67 | digit_count += 1; |
| 68 | v = 16 * v + digit as u32; |
| 69 | } |
| 70 | |
| 71 | let c = std::char::from_u32(v) |
| 72 | .ok_or(perr(offset..closing_pos + 1, InvalidUnicodeEscapeChar))?; |
| 73 | |
| 74 | (E::from_char(c), closing_pos + 1) |
| 75 | } |
| 76 | |
| 77 | _ => return Err(perr(offset..offset + 2, UnknownEscape)), |
| 78 | }; |
| 79 | |
| 80 | Ok(out) |
| 81 | } |
| 82 | |
/// A value that an escape sequence can evaluate to.
///
/// Implementors must be convertible into `char` so that unescaped values can
/// be appended to a `String`.
pub(crate) trait Escapee: Into<char> {
    /// Whether Unicode is allowed for this kind of value. When `false`,
    /// `\u{...}` escapes and non-ASCII input bytes are rejected; when `true`,
    /// `\x` escapes above `0x7F` are rejected instead.
    const SUPPORTS_UNICODE: bool;

    /// Creates the value from the single byte an escape evaluated to.
    fn from_byte(b: u8) -> Self;

    /// Creates the value from the `char` a Unicode escape evaluated to.
    fn from_char(c: char) -> Self;
}
| 88 | |
| 89 | impl Escapee for u8 { |
| 90 | const SUPPORTS_UNICODE: bool = false; |
| 91 | fn from_byte(b: u8) -> Self { |
| 92 | b |
| 93 | } |
| 94 | fn from_char(_: char) -> Self { |
| 95 | panic!("bug: `<u8 as Escapee>::from_char` was called"); |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | impl Escapee for char { |
| 100 | const SUPPORTS_UNICODE: bool = true; |
| 101 | fn from_byte(b: u8) -> Self { |
| 102 | b.into() |
| 103 | } |
| 104 | fn from_char(c: char) -> Self { |
| 105 | c |
| 106 | } |
| 107 | } |
| 108 | |
/// Returns `true` if `b` is one of the whitespace bytes (space, tab, line
/// feed, carriage return) that get skipped after a string continue, i.e. after
/// an unescaped backslash directly followed by `\n`.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
| 114 | |
| 115 | /// Unescapes a whole string or byte string. |
Jeff Vander Stoep | d362f28 | 2023-02-03 10:26:47 +0100 | [diff] [blame] | 116 | #[inline(never)] |
Frederick Mayle | 7bad3ce | 2022-10-27 16:29:18 -0700 | [diff] [blame] | 117 | pub(crate) fn unescape_string<E: Escapee>( |
| 118 | input: &str, |
| 119 | offset: usize, |
| 120 | ) -> Result<Option<String>, ParseError> { |
| 121 | let mut i = offset; |
| 122 | let mut end_last_escape = offset; |
| 123 | let mut value = String::new(); |
| 124 | while i < input.len() - 1 { |
| 125 | match input.as_bytes()[i] { |
| 126 | // Handle "string continue". |
| 127 | b'\\' if input.as_bytes()[i + 1] == b'\n' => { |
| 128 | value.push_str(&input[end_last_escape..i]); |
| 129 | |
| 130 | // Find the first non-whitespace character. |
| 131 | let end_escape = input[i + 2..].bytes() |
| 132 | .position(|b| !is_string_continue_skipable_whitespace(b)) |
| 133 | .ok_or(perr(None, UnterminatedString))?; |
| 134 | |
| 135 | i += 2 + end_escape; |
| 136 | end_last_escape = i; |
| 137 | } |
| 138 | b'\\' => { |
| 139 | let (c, len) = unescape::<E>(&input[i..input.len() - 1], i)?; |
| 140 | value.push_str(&input[end_last_escape..i]); |
| 141 | value.push(c.into()); |
| 142 | i += len; |
| 143 | end_last_escape = i; |
| 144 | } |
| 145 | b'\r' => { |
| 146 | if input.as_bytes()[i + 1] == b'\n' { |
| 147 | value.push_str(&input[end_last_escape..i]); |
| 148 | value.push('\n'); |
| 149 | i += 2; |
| 150 | end_last_escape = i; |
| 151 | } else { |
| 152 | return Err(perr(i, IsolatedCr)) |
| 153 | } |
| 154 | } |
| 155 | b'"' => return Err(perr(i + 1..input.len(), UnexpectedChar)), |
| 156 | b if !E::SUPPORTS_UNICODE && !b.is_ascii() |
| 157 | => return Err(perr(i, NonAsciiInByteLiteral)), |
| 158 | _ => i += 1, |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | if input.as_bytes()[input.len() - 1] != b'"' || input.len() == offset { |
| 163 | return Err(perr(None, UnterminatedString)); |
| 164 | } |
| 165 | |
| 166 | // `value` is only empty if there was no escape in the input string |
| 167 | // (with the special case of the input being empty). This means the |
| 168 | // string value basically equals the input, so we store `None`. |
| 169 | let value = if value.is_empty() { |
| 170 | None |
| 171 | } else { |
| 172 | // There was an escape in the string, so we need to push the |
| 173 | // remaining unescaped part of the string still. |
| 174 | value.push_str(&input[end_last_escape..input.len() - 1]); |
| 175 | Some(value) |
| 176 | }; |
| 177 | |
| 178 | Ok(value) |
| 179 | } |
| 180 | |
| 181 | /// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to |
| 182 | /// just `\n` sequences. Returns an optional new string (if the input contained |
| 183 | /// any `\r\n`) and the number of hashes used by the literal. |
Jeff Vander Stoep | d362f28 | 2023-02-03 10:26:47 +0100 | [diff] [blame] | 184 | #[inline(never)] |
Frederick Mayle | 7bad3ce | 2022-10-27 16:29:18 -0700 | [diff] [blame] | 185 | pub(crate) fn scan_raw_string<E: Escapee>( |
| 186 | input: &str, |
| 187 | offset: usize, |
| 188 | ) -> Result<(Option<String>, u32), ParseError> { |
| 189 | // Raw string literal |
| 190 | let num_hashes = input[offset..].bytes().position(|b| b != b'#') |
| 191 | .ok_or(perr(None, InvalidLiteral))?; |
| 192 | |
| 193 | if input.as_bytes().get(offset + num_hashes) != Some(&b'"') { |
| 194 | return Err(perr(None, InvalidLiteral)); |
| 195 | } |
| 196 | let start_inner = offset + num_hashes + 1; |
| 197 | let hashes = &input[offset..num_hashes + offset]; |
| 198 | |
| 199 | let mut closing_quote_pos = None; |
| 200 | let mut i = start_inner; |
| 201 | let mut end_last_escape = start_inner; |
| 202 | let mut value = String::new(); |
| 203 | while i < input.len() { |
| 204 | let b = input.as_bytes()[i]; |
| 205 | if b == b'"' && input[i + 1..].starts_with(hashes) { |
| 206 | closing_quote_pos = Some(i); |
| 207 | break; |
| 208 | } |
| 209 | |
| 210 | if b == b'\r' { |
| 211 | // Convert `\r\n` into `\n`. This is currently not well documented |
| 212 | // in the Rust reference, but is done even for raw strings. That's |
| 213 | // because rustc simply converts all line endings when reading |
| 214 | // source files. |
| 215 | if input.as_bytes().get(i + 1) == Some(&b'\n') { |
| 216 | value.push_str(&input[end_last_escape..i]); |
| 217 | value.push('\n'); |
| 218 | i += 2; |
| 219 | end_last_escape = i; |
| 220 | continue; |
| 221 | } else if E::SUPPORTS_UNICODE { |
| 222 | // If no \n follows the \r and we are scanning a raw string |
| 223 | // (not raw byte string), we error. |
| 224 | return Err(perr(i, IsolatedCr)) |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | if !E::SUPPORTS_UNICODE { |
| 229 | if !b.is_ascii() { |
| 230 | return Err(perr(i, NonAsciiInByteLiteral)); |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | i += 1; |
| 235 | } |
| 236 | |
| 237 | let closing_quote_pos = closing_quote_pos |
| 238 | .ok_or(perr(None, UnterminatedRawString))?; |
| 239 | |
| 240 | if closing_quote_pos + num_hashes != input.len() - 1 { |
| 241 | return Err(perr(closing_quote_pos + num_hashes + 1..input.len(), UnexpectedChar)); |
| 242 | } |
| 243 | |
| 244 | // `value` is only empty if there was no \r\n in the input string (with the |
| 245 | // special case of the input being empty). This means the string value |
| 246 | // equals the input, so we store `None`. |
| 247 | let value = if value.is_empty() { |
| 248 | None |
| 249 | } else { |
| 250 | // There was an \r\n in the string, so we need to push the remaining |
| 251 | // unescaped part of the string still. |
| 252 | value.push_str(&input[end_last_escape..closing_quote_pos]); |
| 253 | Some(value) |
| 254 | }; |
| 255 | |
| 256 | Ok((value, num_hashes as u32)) |
| 257 | } |