//! Low-level Rust lexer.
//!
//! The idea with `rustc_lexer` is to make a reusable library,
//! by separating out pure lexing and rustc-specific concerns, like spans,
//! error reporting, and interning. So, `rustc_lexer` operates directly on `&str`,
//! produces simple tokens which are a pair of type-tag and a bit of original text,
//! and does not report errors, instead storing them as flags on the token.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
//! For that see [`rustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by the actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so building an actual Rust token stream will
//! be easier.
//!
//! The main entity of this crate is the [`TokenKind`] enum which represents common
//! lexeme types.
//!
//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
Chris Wailes | 2f380c1 | 2022-11-09 13:04:22 -0800 | [diff] [blame] | 21 | #![deny(rustc::untranslatable_diagnostic)] |
| 22 | #![deny(rustc::diagnostic_outside_of_impl)] |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 23 | // We want to be able to build this crate with a stable compiler, so no |
| 24 | // `#![feature]` attributes should be added. |
| 25 | |
| 26 | mod cursor; |
| 27 | pub mod unescape; |
| 28 | |
| 29 | #[cfg(test)] |
| 30 | mod tests; |
| 31 | |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 32 | pub use crate::cursor::Cursor; |
| 33 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 34 | use self::LiteralKind::*; |
| 35 | use self::TokenKind::*; |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 36 | use crate::cursor::EOF_CHAR; |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 37 | |
| 38 | /// Parsed token. |
| 39 | /// It doesn't contain information about data that has been parsed, |
| 40 | /// only the type of the token and its size. |
| 41 | #[derive(Debug)] |
| 42 | pub struct Token { |
| 43 | pub kind: TokenKind, |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 44 | pub len: u32, |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 45 | } |
| 46 | |
| 47 | impl Token { |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 48 | fn new(kind: TokenKind, len: u32) -> Token { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 49 | Token { kind, len } |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | /// Enum representing common lexeme types. |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 54 | #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 55 | pub enum TokenKind { |
| 56 | // Multi-char tokens: |
| 57 | /// "// comment" |
| 58 | LineComment { doc_style: Option<DocStyle> }, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 59 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 60 | /// `/* block comment */` |
| 61 | /// |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 62 | /// Block comments can be recursive, so a sequence like `/* /* */` |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 63 | /// will not be considered terminated and will result in a parsing error. |
| 64 | BlockComment { doc_style: Option<DocStyle>, terminated: bool }, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 65 | |
| 66 | /// Any whitespace character sequence. |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 67 | Whitespace, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 68 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 69 | /// "ident" or "continue" |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 70 | /// |
| 71 | /// At this step, keywords are also considered identifiers. |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 72 | Ident, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 73 | |
Chris Wailes | 356b57e | 2022-01-13 10:08:24 -0800 | [diff] [blame] | 74 | /// Like the above, but containing invalid unicode codepoints. |
| 75 | InvalidIdent, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 76 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 77 | /// "r#ident" |
| 78 | RawIdent, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 79 | |
| 80 | /// An unknown prefix, like `foo#`, `foo'`, `foo"`. |
| 81 | /// |
| 82 | /// Note that only the |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 83 | /// prefix (`foo`) is included in the token, not the separator (which is |
| 84 | /// lexed as its own distinct token). In Rust 2021 and later, reserved |
| 85 | /// prefixes are reported as errors; in earlier editions, they result in a |
| 86 | /// (allowed by default) lint, and are treated as regular identifier |
| 87 | /// tokens. |
| 88 | UnknownPrefix, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 89 | |
Chris Wailes | 977026a | 2023-02-13 09:13:10 -0800 | [diff] [blame] | 90 | /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid |
| 91 | /// suffix, but may be present here on string and float literals. Users of |
| 92 | /// this type will need to check for and reject that case. |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 93 | /// |
| 94 | /// See [LiteralKind] for more details. |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 95 | Literal { kind: LiteralKind, suffix_start: u32 }, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 96 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 97 | /// "'a" |
| 98 | Lifetime { starts_with_number: bool }, |
| 99 | |
| 100 | // One-char tokens: |
| 101 | /// ";" |
| 102 | Semi, |
| 103 | /// "," |
| 104 | Comma, |
| 105 | /// "." |
| 106 | Dot, |
| 107 | /// "(" |
| 108 | OpenParen, |
| 109 | /// ")" |
| 110 | CloseParen, |
| 111 | /// "{" |
| 112 | OpenBrace, |
| 113 | /// "}" |
| 114 | CloseBrace, |
| 115 | /// "[" |
| 116 | OpenBracket, |
| 117 | /// "]" |
| 118 | CloseBracket, |
| 119 | /// "@" |
| 120 | At, |
| 121 | /// "#" |
| 122 | Pound, |
| 123 | /// "~" |
| 124 | Tilde, |
| 125 | /// "?" |
| 126 | Question, |
| 127 | /// ":" |
| 128 | Colon, |
| 129 | /// "$" |
| 130 | Dollar, |
| 131 | /// "=" |
| 132 | Eq, |
| 133 | /// "!" |
| 134 | Bang, |
| 135 | /// "<" |
| 136 | Lt, |
| 137 | /// ">" |
| 138 | Gt, |
| 139 | /// "-" |
| 140 | Minus, |
| 141 | /// "&" |
| 142 | And, |
| 143 | /// "|" |
| 144 | Or, |
| 145 | /// "+" |
| 146 | Plus, |
| 147 | /// "*" |
| 148 | Star, |
| 149 | /// "/" |
| 150 | Slash, |
| 151 | /// "^" |
| 152 | Caret, |
| 153 | /// "%" |
| 154 | Percent, |
| 155 | |
| 156 | /// Unknown token, not expected by the lexer, e.g. "№" |
| 157 | Unknown, |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 158 | |
| 159 | /// End of input. |
| 160 | Eof, |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 161 | } |
| 162 | |
/// Which flavour of doc comment a comment token is.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocStyle {
    /// Outer doc comment: `///` or `/**`.
    Outer,
    /// Inner doc comment: `//!` or `/*!`.
    Inner,
}
| 168 | |
Chris Wailes | 5c0824a | 2023-04-24 16:30:59 -0700 | [diff] [blame] | 169 | /// Enum representing the literal types supported by the lexer. |
| 170 | /// |
| 171 | /// Note that the suffix is *not* considered when deciding the `LiteralKind` in |
| 172 | /// this type. This means that float literals like `1f32` are classified by this |
| 173 | /// type as `Int`. (Compare against `rustc_ast::token::LitKind` and |
| 174 | /// `rustc_ast::ast::LitKind`). |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 175 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] |
| 176 | pub enum LiteralKind { |
Chris Wailes | 977026a | 2023-02-13 09:13:10 -0800 | [diff] [blame] | 177 | /// "12_u8", "0o100", "0b120i99", "1f32". |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 178 | Int { base: Base, empty_int: bool }, |
Chris Wailes | 5c0824a | 2023-04-24 16:30:59 -0700 | [diff] [blame] | 179 | /// "12.34f32", "1e3", but not "1f32". |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 180 | Float { base: Base, empty_exponent: bool }, |
| 181 | /// "'a'", "'\\'", "'''", "';" |
| 182 | Char { terminated: bool }, |
| 183 | /// "b'a'", "b'\\'", "b'''", "b';" |
| 184 | Byte { terminated: bool }, |
| 185 | /// ""abc"", ""abc" |
| 186 | Str { terminated: bool }, |
| 187 | /// "b"abc"", "b"abc" |
| 188 | ByteStr { terminated: bool }, |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 189 | /// `c"abc"`, `c"abc` |
| 190 | CStr { terminated: bool }, |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 191 | /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates |
| 192 | /// an invalid literal. |
| 193 | RawStr { n_hashes: Option<u8> }, |
| 194 | /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None` |
| 195 | /// indicates an invalid literal. |
| 196 | RawByteStr { n_hashes: Option<u8> }, |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 197 | /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal. |
| 198 | RawCStr { n_hashes: Option<u8> }, |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 199 | } |
| 200 | |
/// Reasons a raw string literal can be malformed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Something other than `#` appears between the `r` and the `"`,
    /// e.g. `r##~"abcde"##`.
    InvalidStarter { bad_char: char },
    /// The string was never terminated, e.g. `r###"abcde"##`.
    /// `possible_terminator_offset` is the number of characters after `r` or
    /// `br` where they may have intended to terminate it.
    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    /// More than 255 `#`s exist.
    TooManyDelimiters { found: u32 },
}
| 212 | |
/// Radix of a numeric literal, as determined by its prefix.
///
/// Each variant's discriminant is the numeric value of the radix.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// The literal has a `0b` prefix.
    Binary = 2,
    /// The literal has a `0o` prefix.
    Octal = 8,
    /// The literal has no prefix.
    Decimal = 10,
    /// The literal has a `0x` prefix.
    Hexadecimal = 16,
}
| 225 | |
| 226 | /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", |
| 227 | /// but shebang isn't a part of rust syntax. |
| 228 | pub fn strip_shebang(input: &str) -> Option<usize> { |
| 229 | // Shebang must start with `#!` literally, without any preceding whitespace. |
| 230 | // For simplicity we consider any line starting with `#!` a shebang, |
| 231 | // regardless of restrictions put on shebangs by specific platforms. |
| 232 | if let Some(input_tail) = input.strip_prefix("#!") { |
| 233 | // Ok, this is a shebang but if the next non-whitespace token is `[`, |
| 234 | // then it may be valid Rust code, so consider it Rust code. |
| 235 | let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| { |
| 236 | !matches!( |
| 237 | tok, |
| 238 | TokenKind::Whitespace |
| 239 | | TokenKind::LineComment { doc_style: None } |
| 240 | | TokenKind::BlockComment { doc_style: None, .. } |
| 241 | ) |
| 242 | }); |
| 243 | if next_non_whitespace_token != Some(TokenKind::OpenBracket) { |
| 244 | // No other choice than to consider this a shebang. |
| 245 | return Some(2 + input_tail.lines().next().unwrap_or_default().len()); |
| 246 | } |
| 247 | } |
| 248 | None |
| 249 | } |
| 250 | |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 251 | /// Validates a raw string literal. Used for getting more information about a |
| 252 | /// problem with a `RawStr`/`RawByteStr` with a `None` field. |
| 253 | #[inline] |
| 254 | pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> { |
| 255 | debug_assert!(!input.is_empty()); |
| 256 | let mut cursor = Cursor::new(input); |
| 257 | // Move past the leading `r` or `br`. |
| 258 | for _ in 0..prefix_len { |
| 259 | cursor.bump().unwrap(); |
| 260 | } |
| 261 | cursor.raw_double_quoted_string(prefix_len).map(|_| ()) |
| 262 | } |
| 263 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 264 | /// Creates an iterator that produces tokens from the input string. |
Charisee | 7878d54 | 2022-02-24 18:21:36 +0000 | [diff] [blame] | 265 | pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { |
| 266 | let mut cursor = Cursor::new(input); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 267 | std::iter::from_fn(move || { |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 268 | let token = cursor.advance_token(); |
| 269 | if token.kind != TokenKind::Eof { Some(token) } else { None } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 270 | }) |
| 271 | } |
| 272 | |
/// Returns `true` if `c` counts as whitespace according to the Rust language
/// definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // The set is stable (it does not change between Unicode versions), so the
    // values can simply be hard-coded.
    match c {
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}' => true,
        // ASCII space
        '\u{0020}' => true,
        // NEXT LINE from latin1
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        '\u{200E}' | '\u{200F}' => true,
        // Dedicated Unicode whitespace: LINE SEPARATOR, PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
| 304 | |
| 305 | /// True if `c` is valid as a first character of an identifier. |
| 306 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
| 307 | /// a formal definition of valid identifier name. |
| 308 | pub fn is_id_start(c: char) -> bool { |
| 309 | // This is XID_Start OR '_' (which formally is not a XID_Start). |
Chris Wailes | bcf972c | 2021-10-21 11:03:28 -0700 | [diff] [blame] | 310 | c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 311 | } |
| 312 | |
| 313 | /// True if `c` is valid as a non-first character of an identifier. |
| 314 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
| 315 | /// a formal definition of valid identifier name. |
| 316 | pub fn is_id_continue(c: char) -> bool { |
Chris Wailes | bcf972c | 2021-10-21 11:03:28 -0700 | [diff] [blame] | 317 | unicode_xid::UnicodeXID::is_xid_continue(c) |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 318 | } |
| 319 | |
| 320 | /// The passed string is lexically an identifier. |
| 321 | pub fn is_ident(string: &str) -> bool { |
| 322 | let mut chars = string.chars(); |
| 323 | if let Some(start) = chars.next() { |
| 324 | is_id_start(start) && chars.all(is_id_continue) |
| 325 | } else { |
| 326 | false |
| 327 | } |
| 328 | } |
| 329 | |
| 330 | impl Cursor<'_> { |
| 331 | /// Parses a token from the input string. |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 332 | pub fn advance_token(&mut self) -> Token { |
| 333 | let first_char = match self.bump() { |
| 334 | Some(c) => c, |
| 335 | None => return Token::new(TokenKind::Eof, 0), |
| 336 | }; |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 337 | let token_kind = match first_char { |
| 338 | // Slash, comment or block comment. |
| 339 | '/' => match self.first() { |
| 340 | '/' => self.line_comment(), |
| 341 | '*' => self.block_comment(), |
| 342 | _ => Slash, |
| 343 | }, |
| 344 | |
| 345 | // Whitespace sequence. |
| 346 | c if is_whitespace(c) => self.whitespace(), |
| 347 | |
| 348 | // Raw identifier, raw string literal or identifier. |
| 349 | 'r' => match (self.first(), self.second()) { |
| 350 | ('#', c1) if is_id_start(c1) => self.raw_ident(), |
| 351 | ('#', _) | ('"', _) => { |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 352 | let res = self.raw_double_quoted_string(1); |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 353 | let suffix_start = self.pos_within_token(); |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 354 | if res.is_ok() { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 355 | self.eat_literal_suffix(); |
| 356 | } |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 357 | let kind = RawStr { n_hashes: res.ok() }; |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 358 | Literal { kind, suffix_start } |
| 359 | } |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 360 | _ => self.ident_or_unknown_prefix(), |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 361 | }, |
| 362 | |
| 363 | // Byte literal, byte string literal, raw byte string literal or identifier. |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 364 | 'b' => self.c_or_byte_string( |
| 365 | |terminated| ByteStr { terminated }, |
| 366 | |n_hashes| RawByteStr { n_hashes }, |
| 367 | Some(|terminated| Byte { terminated }), |
| 368 | ), |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 369 | |
| 370 | // Identifier (this should be checked after other variant that can |
| 371 | // start as identifier). |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 372 | c if is_id_start(c) => self.ident_or_unknown_prefix(), |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 373 | |
| 374 | // Numeric literal. |
| 375 | c @ '0'..='9' => { |
| 376 | let literal_kind = self.number(c); |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 377 | let suffix_start = self.pos_within_token(); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 378 | self.eat_literal_suffix(); |
| 379 | TokenKind::Literal { kind: literal_kind, suffix_start } |
| 380 | } |
| 381 | |
| 382 | // One-symbol tokens. |
| 383 | ';' => Semi, |
| 384 | ',' => Comma, |
| 385 | '.' => Dot, |
| 386 | '(' => OpenParen, |
| 387 | ')' => CloseParen, |
| 388 | '{' => OpenBrace, |
| 389 | '}' => CloseBrace, |
| 390 | '[' => OpenBracket, |
| 391 | ']' => CloseBracket, |
| 392 | '@' => At, |
| 393 | '#' => Pound, |
| 394 | '~' => Tilde, |
| 395 | '?' => Question, |
| 396 | ':' => Colon, |
| 397 | '$' => Dollar, |
| 398 | '=' => Eq, |
| 399 | '!' => Bang, |
| 400 | '<' => Lt, |
| 401 | '>' => Gt, |
| 402 | '-' => Minus, |
| 403 | '&' => And, |
| 404 | '|' => Or, |
| 405 | '+' => Plus, |
| 406 | '*' => Star, |
| 407 | '^' => Caret, |
| 408 | '%' => Percent, |
| 409 | |
| 410 | // Lifetime or character literal. |
| 411 | '\'' => self.lifetime_or_char(), |
| 412 | |
| 413 | // String literal. |
| 414 | '"' => { |
| 415 | let terminated = self.double_quoted_string(); |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 416 | let suffix_start = self.pos_within_token(); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 417 | if terminated { |
| 418 | self.eat_literal_suffix(); |
| 419 | } |
| 420 | let kind = Str { terminated }; |
| 421 | Literal { kind, suffix_start } |
| 422 | } |
Chris Wailes | 356b57e | 2022-01-13 10:08:24 -0800 | [diff] [blame] | 423 | // Identifier starting with an emoji. Only lexed for graceful error recovery. |
| 424 | c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { |
| 425 | self.fake_ident_or_unknown_prefix() |
| 426 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 427 | _ => Unknown, |
| 428 | }; |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 429 | let res = Token::new(token_kind, self.pos_within_token()); |
| 430 | self.reset_pos_within_token(); |
| 431 | res |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 432 | } |
| 433 | |
| 434 | fn line_comment(&mut self) -> TokenKind { |
| 435 | debug_assert!(self.prev() == '/' && self.first() == '/'); |
| 436 | self.bump(); |
| 437 | |
| 438 | let doc_style = match self.first() { |
| 439 | // `//!` is an inner line doc comment. |
| 440 | '!' => Some(DocStyle::Inner), |
| 441 | // `////` (more than 3 slashes) is not considered a doc comment. |
| 442 | '/' if self.second() != '/' => Some(DocStyle::Outer), |
| 443 | _ => None, |
| 444 | }; |
| 445 | |
| 446 | self.eat_while(|c| c != '\n'); |
| 447 | LineComment { doc_style } |
| 448 | } |
| 449 | |
| 450 | fn block_comment(&mut self) -> TokenKind { |
| 451 | debug_assert!(self.prev() == '/' && self.first() == '*'); |
| 452 | self.bump(); |
| 453 | |
| 454 | let doc_style = match self.first() { |
| 455 | // `/*!` is an inner block doc comment. |
| 456 | '!' => Some(DocStyle::Inner), |
| 457 | // `/***` (more than 2 stars) is not considered a doc comment. |
| 458 | // `/**/` is not considered a doc comment. |
| 459 | '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), |
| 460 | _ => None, |
| 461 | }; |
| 462 | |
| 463 | let mut depth = 1usize; |
| 464 | while let Some(c) = self.bump() { |
| 465 | match c { |
| 466 | '/' if self.first() == '*' => { |
| 467 | self.bump(); |
| 468 | depth += 1; |
| 469 | } |
| 470 | '*' if self.first() == '/' => { |
| 471 | self.bump(); |
| 472 | depth -= 1; |
| 473 | if depth == 0 { |
| 474 | // This block comment is closed, so for a construction like "/* */ */" |
| 475 | // there will be a successfully parsed block comment "/* */" |
| 476 | // and " */" will be processed separately. |
| 477 | break; |
| 478 | } |
| 479 | } |
| 480 | _ => (), |
| 481 | } |
| 482 | } |
| 483 | |
| 484 | BlockComment { doc_style, terminated: depth == 0 } |
| 485 | } |
| 486 | |
| 487 | fn whitespace(&mut self) -> TokenKind { |
| 488 | debug_assert!(is_whitespace(self.prev())); |
| 489 | self.eat_while(is_whitespace); |
| 490 | Whitespace |
| 491 | } |
| 492 | |
| 493 | fn raw_ident(&mut self) -> TokenKind { |
| 494 | debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); |
| 495 | // Eat "#" symbol. |
| 496 | self.bump(); |
| 497 | // Eat the identifier part of RawIdent. |
| 498 | self.eat_identifier(); |
| 499 | RawIdent |
| 500 | } |
| 501 | |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 502 | fn ident_or_unknown_prefix(&mut self) -> TokenKind { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 503 | debug_assert!(is_id_start(self.prev())); |
| 504 | // Start is already eaten, eat the rest of identifier. |
| 505 | self.eat_while(is_id_continue); |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 506 | // Known prefixes must have been handled earlier. So if |
Chris Wailes | bcf972c | 2021-10-21 11:03:28 -0700 | [diff] [blame] | 507 | // we see a prefix here, it is definitely an unknown prefix. |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 508 | match self.first() { |
| 509 | '#' | '"' | '\'' => UnknownPrefix, |
Chris Wailes | 356b57e | 2022-01-13 10:08:24 -0800 | [diff] [blame] | 510 | c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { |
| 511 | self.fake_ident_or_unknown_prefix() |
| 512 | } |
Chris Wailes | 54272ac | 2021-09-09 16:08:13 -0700 | [diff] [blame] | 513 | _ => Ident, |
| 514 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 515 | } |
| 516 | |
Chris Wailes | 356b57e | 2022-01-13 10:08:24 -0800 | [diff] [blame] | 517 | fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { |
| 518 | // Start is already eaten, eat the rest of identifier. |
| 519 | self.eat_while(|c| { |
| 520 | unicode_xid::UnicodeXID::is_xid_continue(c) |
| 521 | || (!c.is_ascii() && unic_emoji_char::is_emoji(c)) |
| 522 | || c == '\u{200d}' |
| 523 | }); |
| 524 | // Known prefixes must have been handled earlier. So if |
| 525 | // we see a prefix here, it is definitely an unknown prefix. |
| 526 | match self.first() { |
| 527 | '#' | '"' | '\'' => UnknownPrefix, |
| 528 | _ => InvalidIdent, |
| 529 | } |
| 530 | } |
| 531 | |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 532 | fn c_or_byte_string( |
| 533 | &mut self, |
| 534 | mk_kind: impl FnOnce(bool) -> LiteralKind, |
| 535 | mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind, |
| 536 | single_quoted: Option<fn(bool) -> LiteralKind>, |
| 537 | ) -> TokenKind { |
| 538 | match (self.first(), self.second(), single_quoted) { |
| 539 | ('\'', _, Some(mk_kind)) => { |
| 540 | self.bump(); |
| 541 | let terminated = self.single_quoted_string(); |
| 542 | let suffix_start = self.pos_within_token(); |
| 543 | if terminated { |
| 544 | self.eat_literal_suffix(); |
| 545 | } |
| 546 | let kind = mk_kind(terminated); |
| 547 | Literal { kind, suffix_start } |
| 548 | } |
| 549 | ('"', _, _) => { |
| 550 | self.bump(); |
| 551 | let terminated = self.double_quoted_string(); |
| 552 | let suffix_start = self.pos_within_token(); |
| 553 | if terminated { |
| 554 | self.eat_literal_suffix(); |
| 555 | } |
| 556 | let kind = mk_kind(terminated); |
| 557 | Literal { kind, suffix_start } |
| 558 | } |
| 559 | ('r', '"', _) | ('r', '#', _) => { |
| 560 | self.bump(); |
| 561 | let res = self.raw_double_quoted_string(2); |
| 562 | let suffix_start = self.pos_within_token(); |
| 563 | if res.is_ok() { |
| 564 | self.eat_literal_suffix(); |
| 565 | } |
| 566 | let kind = mk_kind_raw(res.ok()); |
| 567 | Literal { kind, suffix_start } |
| 568 | } |
| 569 | _ => self.ident_or_unknown_prefix(), |
| 570 | } |
| 571 | } |
| 572 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 573 | fn number(&mut self, first_digit: char) -> LiteralKind { |
| 574 | debug_assert!('0' <= self.prev() && self.prev() <= '9'); |
| 575 | let mut base = Base::Decimal; |
| 576 | if first_digit == '0' { |
| 577 | // Attempt to parse encoding base. |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 578 | match self.first() { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 579 | 'b' => { |
| 580 | base = Base::Binary; |
| 581 | self.bump(); |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 582 | if !self.eat_decimal_digits() { |
| 583 | return Int { base, empty_int: true }; |
| 584 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 585 | } |
| 586 | 'o' => { |
| 587 | base = Base::Octal; |
| 588 | self.bump(); |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 589 | if !self.eat_decimal_digits() { |
| 590 | return Int { base, empty_int: true }; |
| 591 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 592 | } |
| 593 | 'x' => { |
| 594 | base = Base::Hexadecimal; |
| 595 | self.bump(); |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 596 | if !self.eat_hexadecimal_digits() { |
| 597 | return Int { base, empty_int: true }; |
| 598 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 599 | } |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 600 | // Not a base prefix; consume additional digits. |
| 601 | '0'..='9' | '_' => { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 602 | self.eat_decimal_digits(); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 603 | } |
Chris Wailes | cd1aefd | 2023-07-13 13:36:21 -0700 | [diff] [blame^] | 604 | |
| 605 | // Also not a base prefix; nothing more to do here. |
| 606 | '.' | 'e' | 'E' => {} |
| 607 | |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 608 | // Just a 0. |
| 609 | _ => return Int { base, empty_int: false }, |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 610 | } |
| 611 | } else { |
| 612 | // No base prefix, parse number in the usual way. |
| 613 | self.eat_decimal_digits(); |
| 614 | }; |
| 615 | |
| 616 | match self.first() { |
| 617 | // Don't be greedy if this is actually an |
| 618 | // integer literal followed by field/method access or a range pattern |
| 619 | // (`0..2` and `12.foo()`) |
| 620 | '.' if self.second() != '.' && !is_id_start(self.second()) => { |
| 621 | // might have stuff after the ., and if it does, it needs to start |
| 622 | // with a number |
| 623 | self.bump(); |
| 624 | let mut empty_exponent = false; |
| 625 | if self.first().is_digit(10) { |
| 626 | self.eat_decimal_digits(); |
| 627 | match self.first() { |
| 628 | 'e' | 'E' => { |
| 629 | self.bump(); |
| 630 | empty_exponent = !self.eat_float_exponent(); |
| 631 | } |
| 632 | _ => (), |
| 633 | } |
| 634 | } |
| 635 | Float { base, empty_exponent } |
| 636 | } |
| 637 | 'e' | 'E' => { |
| 638 | self.bump(); |
| 639 | let empty_exponent = !self.eat_float_exponent(); |
| 640 | Float { base, empty_exponent } |
| 641 | } |
| 642 | _ => Int { base, empty_int: false }, |
| 643 | } |
| 644 | } |
| 645 | |
| 646 | fn lifetime_or_char(&mut self) -> TokenKind { |
| 647 | debug_assert!(self.prev() == '\''); |
| 648 | |
| 649 | let can_be_a_lifetime = if self.second() == '\'' { |
| 650 | // It's surely not a lifetime. |
| 651 | false |
| 652 | } else { |
| 653 | // If the first symbol is valid for identifier, it can be a lifetime. |
| 654 | // Also check if it's a number for a better error reporting (so '0 will |
| 655 | // be reported as invalid lifetime and not as unterminated char literal). |
| 656 | is_id_start(self.first()) || self.first().is_digit(10) |
| 657 | }; |
| 658 | |
| 659 | if !can_be_a_lifetime { |
| 660 | let terminated = self.single_quoted_string(); |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 661 | let suffix_start = self.pos_within_token(); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 662 | if terminated { |
| 663 | self.eat_literal_suffix(); |
| 664 | } |
| 665 | let kind = Char { terminated }; |
| 666 | return Literal { kind, suffix_start }; |
| 667 | } |
| 668 | |
| 669 | // Either a lifetime or a character literal with |
| 670 | // length greater than 1. |
| 671 | |
| 672 | let starts_with_number = self.first().is_digit(10); |
| 673 | |
| 674 | // Skip the literal contents. |
| 675 | // First symbol can be a number (which isn't a valid identifier start), |
| 676 | // so skip it without any checks. |
| 677 | self.bump(); |
| 678 | self.eat_while(is_id_continue); |
| 679 | |
| 680 | // Check if after skipping literal contents we've met a closing |
| 681 | // single quote (which means that user attempted to create a |
| 682 | // string with single quotes). |
| 683 | if self.first() == '\'' { |
| 684 | self.bump(); |
| 685 | let kind = Char { terminated: true }; |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 686 | Literal { kind, suffix_start: self.pos_within_token() } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 687 | } else { |
| 688 | Lifetime { starts_with_number } |
| 689 | } |
| 690 | } |
| 691 | |
| 692 | fn single_quoted_string(&mut self) -> bool { |
| 693 | debug_assert!(self.prev() == '\''); |
| 694 | // Check if it's a one-symbol literal. |
| 695 | if self.second() == '\'' && self.first() != '\\' { |
| 696 | self.bump(); |
| 697 | self.bump(); |
| 698 | return true; |
| 699 | } |
| 700 | |
| 701 | // Literal has more than one symbol. |
| 702 | |
| 703 | // Parse until either quotes are terminated or error is detected. |
| 704 | loop { |
| 705 | match self.first() { |
| 706 | // Quotes are terminated, finish parsing. |
| 707 | '\'' => { |
| 708 | self.bump(); |
| 709 | return true; |
| 710 | } |
| 711 | // Probably beginning of the comment, which we don't want to include |
| 712 | // to the error report. |
| 713 | '/' => break, |
| 714 | // Newline without following '\'' means unclosed quote, stop parsing. |
| 715 | '\n' if self.second() != '\'' => break, |
| 716 | // End of file, stop parsing. |
| 717 | EOF_CHAR if self.is_eof() => break, |
| 718 | // Escaped slash is considered one character, so bump twice. |
| 719 | '\\' => { |
| 720 | self.bump(); |
| 721 | self.bump(); |
| 722 | } |
| 723 | // Skip the character. |
| 724 | _ => { |
| 725 | self.bump(); |
| 726 | } |
| 727 | } |
| 728 | } |
| 729 | // String was not terminated. |
| 730 | false |
| 731 | } |
| 732 | |
| 733 | /// Eats double-quoted string and returns true |
| 734 | /// if string is terminated. |
| 735 | fn double_quoted_string(&mut self) -> bool { |
| 736 | debug_assert!(self.prev() == '"'); |
| 737 | while let Some(c) = self.bump() { |
| 738 | match c { |
| 739 | '"' => { |
| 740 | return true; |
| 741 | } |
| 742 | '\\' if self.first() == '\\' || self.first() == '"' => { |
| 743 | // Bump again to skip escaped character. |
| 744 | self.bump(); |
| 745 | } |
| 746 | _ => (), |
| 747 | } |
| 748 | } |
| 749 | // End of file reached. |
| 750 | false |
| 751 | } |
| 752 | |
| 753 | /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 754 | fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 755 | // Wrap the actual function to handle the error with too many hashes. |
| 756 | // This way, it eats the whole raw string. |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 757 | let n_hashes = self.raw_string_unvalidated(prefix_len)?; |
Charisee | 341341c | 2022-05-20 05:14:50 +0000 | [diff] [blame] | 758 | // Only up to 255 `#`s are allowed in raw strings |
| 759 | match u8::try_from(n_hashes) { |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 760 | Ok(num) => Ok(num), |
| 761 | Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }), |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 762 | } |
| 763 | } |
| 764 | |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 765 | fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> { |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 766 | debug_assert!(self.prev() == 'r'); |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 767 | let start_pos = self.pos_within_token(); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 768 | let mut possible_terminator_offset = None; |
| 769 | let mut max_hashes = 0; |
| 770 | |
| 771 | // Count opening '#' symbols. |
Thiébaud Weksteen | 5bd94c1 | 2021-01-06 15:18:42 +0100 | [diff] [blame] | 772 | let mut eaten = 0; |
| 773 | while self.first() == '#' { |
| 774 | eaten += 1; |
| 775 | self.bump(); |
| 776 | } |
| 777 | let n_start_hashes = eaten; |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 778 | |
| 779 | // Check that string is started. |
| 780 | match self.bump() { |
| 781 | Some('"') => (), |
| 782 | c => { |
| 783 | let c = c.unwrap_or(EOF_CHAR); |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 784 | return Err(RawStrError::InvalidStarter { bad_char: c }); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 785 | } |
| 786 | } |
| 787 | |
| 788 | // Skip the string contents and on each '#' character met, check if this is |
| 789 | // a raw string termination. |
| 790 | loop { |
| 791 | self.eat_while(|c| c != '"'); |
| 792 | |
| 793 | if self.is_eof() { |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 794 | return Err(RawStrError::NoTerminator { |
| 795 | expected: n_start_hashes, |
| 796 | found: max_hashes, |
| 797 | possible_terminator_offset, |
| 798 | }); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 799 | } |
| 800 | |
| 801 | // Eat closing double quote. |
| 802 | self.bump(); |
| 803 | |
| 804 | // Check that amount of closing '#' symbols |
| 805 | // is equal to the amount of opening ones. |
| 806 | // Note that this will not consume extra trailing `#` characters: |
| 807 | // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` |
| 808 | // followed by a `#` token. |
Thiébaud Weksteen | 5bd94c1 | 2021-01-06 15:18:42 +0100 | [diff] [blame] | 809 | let mut n_end_hashes = 0; |
| 810 | while self.first() == '#' && n_end_hashes < n_start_hashes { |
| 811 | n_end_hashes += 1; |
| 812 | self.bump(); |
| 813 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 814 | |
| 815 | if n_end_hashes == n_start_hashes { |
Charisee | b1d3280 | 2022-09-22 15:38:41 +0000 | [diff] [blame] | 816 | return Ok(n_start_hashes); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 817 | } else if n_end_hashes > max_hashes { |
| 818 | // Keep track of possible terminators to give a hint about |
| 819 | // where there might be a missing terminator |
| 820 | possible_terminator_offset = |
Charisee | f7ad1c4 | 2023-01-30 22:46:42 +0000 | [diff] [blame] | 821 | Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len); |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 822 | max_hashes = n_end_hashes; |
| 823 | } |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | fn eat_decimal_digits(&mut self) -> bool { |
| 828 | let mut has_digits = false; |
| 829 | loop { |
| 830 | match self.first() { |
| 831 | '_' => { |
| 832 | self.bump(); |
| 833 | } |
| 834 | '0'..='9' => { |
| 835 | has_digits = true; |
| 836 | self.bump(); |
| 837 | } |
| 838 | _ => break, |
| 839 | } |
| 840 | } |
| 841 | has_digits |
| 842 | } |
| 843 | |
| 844 | fn eat_hexadecimal_digits(&mut self) -> bool { |
| 845 | let mut has_digits = false; |
| 846 | loop { |
| 847 | match self.first() { |
| 848 | '_' => { |
| 849 | self.bump(); |
| 850 | } |
| 851 | '0'..='9' | 'a'..='f' | 'A'..='F' => { |
| 852 | has_digits = true; |
| 853 | self.bump(); |
| 854 | } |
| 855 | _ => break, |
| 856 | } |
| 857 | } |
| 858 | has_digits |
| 859 | } |
| 860 | |
| 861 | /// Eats the float exponent. Returns true if at least one digit was met, |
| 862 | /// and returns false otherwise. |
| 863 | fn eat_float_exponent(&mut self) -> bool { |
| 864 | debug_assert!(self.prev() == 'e' || self.prev() == 'E'); |
| 865 | if self.first() == '-' || self.first() == '+' { |
| 866 | self.bump(); |
| 867 | } |
| 868 | self.eat_decimal_digits() |
| 869 | } |
| 870 | |
Chris Wailes | 977026a | 2023-02-13 09:13:10 -0800 | [diff] [blame] | 871 | // Eats the suffix of the literal, e.g. "u8". |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 872 | fn eat_literal_suffix(&mut self) { |
| 873 | self.eat_identifier(); |
| 874 | } |
| 875 | |
Chris Wailes | 977026a | 2023-02-13 09:13:10 -0800 | [diff] [blame] | 876 | // Eats the identifier. Note: succeeds on `_`, which isn't a valid |
Charisee | d720b3f | 2023-03-09 17:35:07 +0000 | [diff] [blame] | 877 | // identifier. |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 878 | fn eat_identifier(&mut self) { |
| 879 | if !is_id_start(self.first()) { |
| 880 | return; |
| 881 | } |
| 882 | self.bump(); |
| 883 | |
| 884 | self.eat_while(is_id_continue); |
| 885 | } |
Thiébaud Weksteen | 3b664ca | 2020-11-26 14:41:59 +0100 | [diff] [blame] | 886 | } |