blob: 19b63a106f967243daa7d64f4131492a4a613c44 [file] [log] [blame]
Frederick Mayle7bad3ce2022-10-27 16:29:18 -07001use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::hex_digit_value};
2
3
4/// Must start with `\`
5pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> {
6 let first = input.as_bytes().get(1)
7 .ok_or(perr(offset, UnterminatedEscape))?;
8 let out = match first {
9 // Quote escapes
10 b'\'' => (E::from_byte(b'\''), 2),
11 b'"' => (E::from_byte(b'"'), 2),
12
13 // Ascii escapes
14 b'n' => (E::from_byte(b'\n'), 2),
15 b'r' => (E::from_byte(b'\r'), 2),
16 b't' => (E::from_byte(b'\t'), 2),
17 b'\\' => (E::from_byte(b'\\'), 2),
18 b'0' => (E::from_byte(b'\0'), 2),
19 b'x' => {
20 let hex_string = input.get(2..4)
21 .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))?
22 .as_bytes();
23 let first = hex_digit_value(hex_string[0])
24 .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
25 let second = hex_digit_value(hex_string[1])
26 .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
27 let value = second + 16 * first;
28
29 if E::SUPPORTS_UNICODE && value > 0x7F {
30 return Err(perr(offset..offset + 4, NonAsciiXEscape));
31 }
32
33 (E::from_byte(value), 4)
34 },
35
36 // Unicode escape
37 b'u' => {
38 if !E::SUPPORTS_UNICODE {
39 return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral));
40 }
41
42 if input.as_bytes().get(2) != Some(&b'{') {
43 return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace));
44 }
45
46 let closing_pos = input.bytes().position(|b| b == b'}')
47 .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?;
48
49 let inner = &input[3..closing_pos];
50 if inner.as_bytes().first() == Some(&b'_') {
51 return Err(perr(4, InvalidStartOfUnicodeEscape));
52 }
53
54 let mut v: u32 = 0;
55 let mut digit_count = 0;
56 for (i, b) in inner.bytes().enumerate() {
57 if b == b'_'{
58 continue;
59 }
60
61 let digit = hex_digit_value(b)
62 .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?;
63
64 if digit_count == 6 {
65 return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape));
66 }
67 digit_count += 1;
68 v = 16 * v + digit as u32;
69 }
70
71 let c = std::char::from_u32(v)
72 .ok_or(perr(offset..closing_pos + 1, InvalidUnicodeEscapeChar))?;
73
74 (E::from_char(c), closing_pos + 1)
75 }
76
77 _ => return Err(perr(offset..offset + 2, UnknownEscape)),
78 };
79
80 Ok(out)
81}
82
/// A value type that an escape sequence can be unescaped into.
///
/// Every implementor must be convertible into a `char`.
pub(crate) trait Escapee
where
    Self: Into<char>,
{
    /// Whether `\u{...}` escapes are permitted for this type. When `true`,
    /// `\x` escapes are additionally restricted to the ASCII range.
    const SUPPORTS_UNICODE: bool;

    /// Builds the value from a single byte.
    fn from_byte(b: u8) -> Self;

    /// Builds the value from a Unicode scalar value.
    fn from_char(c: char) -> Self;
}
88
89impl Escapee for u8 {
90 const SUPPORTS_UNICODE: bool = false;
91 fn from_byte(b: u8) -> Self {
92 b
93 }
94 fn from_char(_: char) -> Self {
95 panic!("bug: `<u8 as Escapee>::from_char` was called");
96 }
97}
98
99impl Escapee for char {
100 const SUPPORTS_UNICODE: bool = true;
101 fn from_byte(b: u8) -> Self {
102 b.into()
103 }
104 fn from_char(c: char) -> Self {
105 c
106 }
107}
108
/// Returns `true` if the byte is whitespace that gets skipped after a "string
/// continue" start (an unescaped backslash directly followed by `\n`).
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
114
115/// Unescapes a whole string or byte string.
Jeff Vander Stoepd362f282023-02-03 10:26:47 +0100116#[inline(never)]
Frederick Mayle7bad3ce2022-10-27 16:29:18 -0700117pub(crate) fn unescape_string<E: Escapee>(
118 input: &str,
119 offset: usize,
120) -> Result<Option<String>, ParseError> {
121 let mut i = offset;
122 let mut end_last_escape = offset;
123 let mut value = String::new();
124 while i < input.len() - 1 {
125 match input.as_bytes()[i] {
126 // Handle "string continue".
127 b'\\' if input.as_bytes()[i + 1] == b'\n' => {
128 value.push_str(&input[end_last_escape..i]);
129
130 // Find the first non-whitespace character.
131 let end_escape = input[i + 2..].bytes()
132 .position(|b| !is_string_continue_skipable_whitespace(b))
133 .ok_or(perr(None, UnterminatedString))?;
134
135 i += 2 + end_escape;
136 end_last_escape = i;
137 }
138 b'\\' => {
139 let (c, len) = unescape::<E>(&input[i..input.len() - 1], i)?;
140 value.push_str(&input[end_last_escape..i]);
141 value.push(c.into());
142 i += len;
143 end_last_escape = i;
144 }
145 b'\r' => {
146 if input.as_bytes()[i + 1] == b'\n' {
147 value.push_str(&input[end_last_escape..i]);
148 value.push('\n');
149 i += 2;
150 end_last_escape = i;
151 } else {
152 return Err(perr(i, IsolatedCr))
153 }
154 }
155 b'"' => return Err(perr(i + 1..input.len(), UnexpectedChar)),
156 b if !E::SUPPORTS_UNICODE && !b.is_ascii()
157 => return Err(perr(i, NonAsciiInByteLiteral)),
158 _ => i += 1,
159 }
160 }
161
162 if input.as_bytes()[input.len() - 1] != b'"' || input.len() == offset {
163 return Err(perr(None, UnterminatedString));
164 }
165
166 // `value` is only empty if there was no escape in the input string
167 // (with the special case of the input being empty). This means the
168 // string value basically equals the input, so we store `None`.
169 let value = if value.is_empty() {
170 None
171 } else {
172 // There was an escape in the string, so we need to push the
173 // remaining unescaped part of the string still.
174 value.push_str(&input[end_last_escape..input.len() - 1]);
175 Some(value)
176 };
177
178 Ok(value)
179}
180
181/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
182/// just `\n` sequences. Returns an optional new string (if the input contained
183/// any `\r\n`) and the number of hashes used by the literal.
Jeff Vander Stoepd362f282023-02-03 10:26:47 +0100184#[inline(never)]
Frederick Mayle7bad3ce2022-10-27 16:29:18 -0700185pub(crate) fn scan_raw_string<E: Escapee>(
186 input: &str,
187 offset: usize,
188) -> Result<(Option<String>, u32), ParseError> {
189 // Raw string literal
190 let num_hashes = input[offset..].bytes().position(|b| b != b'#')
191 .ok_or(perr(None, InvalidLiteral))?;
192
193 if input.as_bytes().get(offset + num_hashes) != Some(&b'"') {
194 return Err(perr(None, InvalidLiteral));
195 }
196 let start_inner = offset + num_hashes + 1;
197 let hashes = &input[offset..num_hashes + offset];
198
199 let mut closing_quote_pos = None;
200 let mut i = start_inner;
201 let mut end_last_escape = start_inner;
202 let mut value = String::new();
203 while i < input.len() {
204 let b = input.as_bytes()[i];
205 if b == b'"' && input[i + 1..].starts_with(hashes) {
206 closing_quote_pos = Some(i);
207 break;
208 }
209
210 if b == b'\r' {
211 // Convert `\r\n` into `\n`. This is currently not well documented
212 // in the Rust reference, but is done even for raw strings. That's
213 // because rustc simply converts all line endings when reading
214 // source files.
215 if input.as_bytes().get(i + 1) == Some(&b'\n') {
216 value.push_str(&input[end_last_escape..i]);
217 value.push('\n');
218 i += 2;
219 end_last_escape = i;
220 continue;
221 } else if E::SUPPORTS_UNICODE {
222 // If no \n follows the \r and we are scanning a raw string
223 // (not raw byte string), we error.
224 return Err(perr(i, IsolatedCr))
225 }
226 }
227
228 if !E::SUPPORTS_UNICODE {
229 if !b.is_ascii() {
230 return Err(perr(i, NonAsciiInByteLiteral));
231 }
232 }
233
234 i += 1;
235 }
236
237 let closing_quote_pos = closing_quote_pos
238 .ok_or(perr(None, UnterminatedRawString))?;
239
240 if closing_quote_pos + num_hashes != input.len() - 1 {
241 return Err(perr(closing_quote_pos + num_hashes + 1..input.len(), UnexpectedChar));
242 }
243
244 // `value` is only empty if there was no \r\n in the input string (with the
245 // special case of the input being empty). This means the string value
246 // equals the input, so we store `None`.
247 let value = if value.is_empty() {
248 None
249 } else {
250 // There was an \r\n in the string, so we need to push the remaining
251 // unescaped part of the string still.
252 value.push_str(&input[end_last_escape..closing_quote_pos]);
253 Some(value)
254 };
255
256 Ok((value, num_hashes as u32))
257}