Blame - compiler/rustc_lexer/src/lib.rs - toolchain/rustc

blob: 29335a8c0f4cd3db73b3daf1b2d07e2ad2370943 [file] [log] [blame]

Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	1	//! Low-level Rust lexer.
				2	//!
Chris Wailes	32f7835	2021-07-20 14:04:55 -0700	[diff] [blame]	3	//! The idea with `rustc_lexer` is to make a reusable library,
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	4	//! by separating out pure lexing and rustc-specific concerns, like spans,
Charisee	d720b3f	2023-03-09 17:35:07 +0000	[diff] [blame]	5	//! error reporting, and interning. So, rustc_lexer operates directly on `&str`,
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	6	//! produces simple tokens which are a pair of type-tag and a bit of original text,
				7	//! and does not report errors, instead storing them as flags on the token.
				8	//!
				9	//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
Chris Wailes	32f7835	2021-07-20 14:04:55 -0700	[diff] [blame]	10	//! For that see [`rustc_parse::lexer`], which converts this basic token stream
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	11	//! into wide tokens used by actual parser.
				12	//!
				13	//! The purpose of this crate is to convert raw sources into a labeled sequence
				14	//! of well-known token types, so building an actual Rust token stream will
				15	//! be easier.
				16	//!
				17	//! The main entity of this crate is the [`TokenKind`] enum which represents common
				18	//! lexeme types.
				19	//!
Chris Wailes	32f7835	2021-07-20 14:04:55 -0700	[diff] [blame]	20	//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
Chris Wailes	2f380c1	2022-11-09 13:04:22 -0800	[diff] [blame]	21	#![deny(rustc::untranslatable_diagnostic)]
				22	#![deny(rustc::diagnostic_outside_of_impl)]
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	23	// We want to be able to build this crate with a stable compiler, so no
				24	// `#![feature]` attributes should be added.
				25
				26	mod cursor;
				27	pub mod unescape;
				28
				29	#[cfg(test)]
				30	mod tests;
				31
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	32	pub use crate::cursor::Cursor;
				33
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	34	use self::LiteralKind::*;
				35	use self::TokenKind::*;
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	36	use crate::cursor::EOF_CHAR;
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	37
/// Parsed token.
///
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    /// What kind of lexeme this token is.
    pub kind: TokenKind,
    /// Size of the token, as reported by the cursor's `pos_within_token()`
    /// at the moment the token was finished.
    pub len: u32,
}
				46
				47	impl Token {
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	48	fn new(kind: TokenKind, len: u32) -> Token {
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	49	Token { kind, len }
				50	}
				51	}
				52
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    // Multi-char tokens:
    /// `// comment`
    ///
    /// `doc_style` is `Some` for line doc comments (`///` and `//!`) and
    /// `None` for ordinary line comments.
    LineComment { doc_style: Option<DocStyle> },

    /// `/* block comment */`
    ///
    /// Block comments can be recursive, so a sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment { doc_style: Option<DocStyle>, terminated: bool },

    /// Any whitespace character sequence.
    Whitespace,

    /// `ident` or `continue`
    ///
    /// At this step, keywords are also considered identifiers.
    Ident,

    /// Like the above, but containing invalid unicode codepoints.
    InvalidIdent,

    /// `r#ident`
    RawIdent,

    /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
    ///
    /// Note that only the
    /// prefix (`foo`) is included in the token, not the separator (which is
    /// lexed as its own distinct token). In Rust 2021 and later, reserved
    /// prefixes are reported as errors; in earlier editions, they result in a
    /// (allowed by default) lint, and are treated as regular identifier
    /// tokens.
    UnknownPrefix,

    /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
    /// suffix, but may be present here on string and float literals. Users of
    /// this type will need to check for and reject that case.
    ///
    /// See [LiteralKind] for more details.
    Literal { kind: LiteralKind, suffix_start: u32 },

    /// `'a`
    Lifetime { starts_with_number: bool },

    // One-char tokens:
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,

    /// End of input.
    Eof,
}
				162
/// Style of a doc comment, as recorded on `LineComment` and `BlockComment`
/// tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocStyle {
    /// Outer doc comment: `///` or `/**`.
    Outer,
    /// Inner doc comment: `//!` or `/*!`.
    Inner,
}
				168
/// Enum representing the literal types supported by the lexer.
///
/// Note that the suffix is not considered when deciding the `LiteralKind` in
/// this type. This means that float literals like `1f32` are classified by this
/// type as `Int`. (Compare against `rustc_ast::token::LitKind` and
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `1e3`, but not `1f32`.
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';`
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';`
    Byte { terminated: bool },
    /// `"abc"`, `"abc`
    Str { terminated: bool },
    /// `b"abc"`, `b"abc`
    ByteStr { terminated: bool },
    /// `c"abc"`, `c"abc`
    CStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
    /// an invalid literal.
    RawStr { n_hashes: Option<u8> },
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
    /// indicates an invalid literal.
    RawByteStr { n_hashes: Option<u8> },
    /// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
    RawCStr { n_hashes: Option<u8> },
}
				200
/// Reason a raw string literal failed validation; this is the error type
/// returned by `validate_raw_str`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
    InvalidStarter { bad_char: char },
    /// The string was not terminated, e.g. `r###"abcde"##`.
    /// `possible_terminator_offset` is the number of characters after `r` or
    /// `br` where they may have intended to terminate it.
    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    /// More than 255 `#`s exist.
    TooManyDelimiters { found: u32 },
}
				212
/// Base of numeric literal encoding according to its prefix.
///
/// Each variant's discriminant is the numeric radix it stands for.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b".
    Binary = 2,
    /// Literal starts with "0o".
    Octal = 8,
    /// Literal doesn't contain a prefix.
    Decimal = 10,
    /// Literal starts with "0x".
    Hexadecimal = 16,
}
				225
				226	/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
				227	/// but shebang isn't a part of rust syntax.
				228	pub fn strip_shebang(input: &str) -> Option<usize> {
				229	// Shebang must start with `#!` literally, without any preceding whitespace.
				230	// For simplicity we consider any line starting with `#!` a shebang,
				231	// regardless of restrictions put on shebangs by specific platforms.
				232	if let Some(input_tail) = input.strip_prefix("#!") {
				233	// Ok, this is a shebang but if the next non-whitespace token is `[`,
				234	// then it may be valid Rust code, so consider it Rust code.
				235	let next_non_whitespace_token = tokenize(input_tail).map(\|tok\| tok.kind).find(\|tok\| {
				236	!matches!(
				237	tok,
				238	TokenKind::Whitespace
				239	\| TokenKind::LineComment { doc_style: None }
				240	\| TokenKind::BlockComment { doc_style: None, .. }
				241	)
				242	});
				243	if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
				244	// No other choice than to consider this a shebang.
				245	return Some(2 + input_tail.lines().next().unwrap_or_default().len());
				246	}
				247	}
				248	None
				249	}
				250
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	251	/// Validates a raw string literal. Used for getting more information about a
				252	/// problem with a `RawStr`/`RawByteStr` with a `None` field.
				253	#[inline]
				254	pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
				255	debug_assert!(!input.is_empty());
				256	let mut cursor = Cursor::new(input);
				257	// Move past the leading `r` or `br`.
				258	for _ in 0..prefix_len {
				259	cursor.bump().unwrap();
				260	}
				261	cursor.raw_double_quoted_string(prefix_len).map(\|_\| ())
				262	}
				263
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	264	/// Creates an iterator that produces tokens from the input string.
Charisee	7878d54	2022-02-24 18:21:36 +0000	[diff] [blame]	265	pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
				266	let mut cursor = Cursor::new(input);
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	267	std::iter::from_fn(move \|\| {
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	268	let token = cursor.advance_token();
				269	if token.kind != TokenKind::Eof { Some(token) } else { None }
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	270	})
				271	}
				272
				273	/// True if `c` is considered a whitespace according to Rust language definition.
				274	/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
				275	/// for definitions of these classes.
				276	pub fn is_whitespace(c: char) -> bool {
				277	// This is Pattern_White_Space.
				278	//
				279	// Note that this set is stable (ie, it doesn't change with different
				280	// Unicode versions), so it's ok to just hard-code the values.
				281
Thiébaud Weksteen	5bd94c1	2021-01-06 15:18:42 +0100	[diff] [blame]	282	matches!(
				283	c,
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	284	// Usual ASCII suspects
Thiébaud Weksteen	5bd94c1	2021-01-06 15:18:42 +0100	[diff] [blame]	285	'\u{0009}' // \t
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	286	\| '\u{000A}' // \n
				287	\| '\u{000B}' // vertical tab
				288	\| '\u{000C}' // form feed
				289	\| '\u{000D}' // \r
				290	\| '\u{0020}' // space
				291
				292	// NEXT LINE from latin1
				293	\| '\u{0085}'
				294
				295	// Bidi markers
				296	\| '\u{200E}' // LEFT-TO-RIGHT MARK
				297	\| '\u{200F}' // RIGHT-TO-LEFT MARK
				298
				299	// Dedicated whitespace characters from Unicode
				300	\| '\u{2028}' // LINE SEPARATOR
				301	\| '\u{2029}' // PARAGRAPH SEPARATOR
Thiébaud Weksteen	5bd94c1	2021-01-06 15:18:42 +0100	[diff] [blame]	302	)
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	303	}
				304
				305	/// True if `c` is valid as a first character of an identifier.
				306	/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
				307	/// a formal definition of valid identifier name.
				308	pub fn is_id_start(c: char) -> bool {
				309	// This is XID_Start OR '_' (which formally is not a XID_Start).
Chris Wailes	bcf972c	2021-10-21 11:03:28 -0700	[diff] [blame]	310	c == '_' \|\| unicode_xid::UnicodeXID::is_xid_start(c)
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	311	}
				312
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    // This is XID_Continue, as defined by the `unicode_xid` crate.
    unicode_xid::UnicodeXID::is_xid_continue(c)
}
				319
				320	/// The passed string is lexically an identifier.
				321	pub fn is_ident(string: &str) -> bool {
				322	let mut chars = string.chars();
				323	if let Some(start) = chars.next() {
				324	is_id_start(start) && chars.all(is_id_continue)
				325	} else {
				326	false
				327	}
				328	}
				329
				330	impl Cursor<'_> {
    /// Parses a token from the input string.
    ///
    /// Takes one character with `bump()` and dispatches on it. When `bump()`
    /// yields `None`, a zero-length `Eof` token is returned. Otherwise the
    /// token's length is whatever `pos_within_token()` reports once the
    /// matching sub-lexer has finished, after which that position is reset
    /// for the next call.
    ///
    /// The order of the match arms matters: the arms for `r` and `b` must
    /// come before the generic identifier arm, since both characters are
    /// themselves valid identifier starts.
    pub fn advance_token(&mut self) -> Token {
        let first_char = match self.bump() {
            Some(c) => c,
            None => return Token::new(TokenKind::Eof, 0),
        };
        let token_kind = match first_char {
            // Slash, comment or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let res = self.raw_double_quoted_string(1);
                    // The suffix (if any) starts right after the string body.
                    let suffix_start = self.pos_within_token();
                    // A suffix is only looked for on a well-formed raw string.
                    if res.is_ok() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes: res.ok() };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
            'b' => self.c_or_byte_string(
                |terminated| ByteStr { terminated },
                |n_hashes| RawByteStr { n_hashes },
                Some(|terminated| Byte { terminated }),
            ),

            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.pos_within_token();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                // A suffix is only looked for on a terminated string.
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
                self.fake_ident_or_unknown_prefix()
            }
            _ => Unknown,
        };
        let res = Token::new(token_kind, self.pos_within_token());
        self.reset_pos_within_token();
        res
    }
				433
				434	fn line_comment(&mut self) -> TokenKind {
				435	debug_assert!(self.prev() == '/' && self.first() == '/');
				436	self.bump();
				437
				438	let doc_style = match self.first() {
				439	// `//!` is an inner line doc comment.
				440	'!' => Some(DocStyle::Inner),
				441	// `////` (more than 3 slashes) is not considered a doc comment.
				442	'/' if self.second() != '/' => Some(DocStyle::Outer),
				443	_ => None,
				444	};
				445
				446	self.eat_while(\|c\| c != '\n');
				447	LineComment { doc_style }
				448	}
				449
				450	fn block_comment(&mut self) -> TokenKind {
				451	debug_assert!(self.prev() == '/' && self.first() == '*');
				452	self.bump();
				453
				454	let doc_style = match self.first() {
				455	// `/*!` is an inner block doc comment.
				456	'!' => Some(DocStyle::Inner),
				457	// `/***` (more than 2 stars) is not considered a doc comment.
				458	// `/**/` is not considered a doc comment.
				459	'' if !matches!(self.second(), '' \| '/') => Some(DocStyle::Outer),
				460	_ => None,
				461	};
				462
				463	let mut depth = 1usize;
				464	while let Some(c) = self.bump() {
				465	match c {
				466	'/' if self.first() == '*' => {
				467	self.bump();
				468	depth += 1;
				469	}
				470	'*' if self.first() == '/' => {
				471	self.bump();
				472	depth -= 1;
				473	if depth == 0 {
				474	// This block comment is closed, so for a construction like "/* / /"
				475	// there will be a successfully parsed block comment "/* */"
				476	// and " */" will be processed separately.
				477	break;
				478	}
				479	}
				480	_ => (),
				481	}
				482	}
				483
				484	BlockComment { doc_style, terminated: depth == 0 }
				485	}
				486
    /// Lexes a whitespace sequence. The first whitespace character has
    /// already been consumed by the caller (checked by the debug assertion);
    /// the rest of the run is handed to `eat_while` with [`is_whitespace`].
    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }
				492
    /// Lexes a raw identifier such as `r#ident`. On entry the `r` has been
    /// consumed, `#` is next, and the character after it is a valid
    /// identifier start (all checked by the debug assertion).
    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
        // Eat "#" symbol.
        self.bump();
        // Eat the identifier part of RawIdent.
        self.eat_identifier();
        RawIdent
    }
				501
Chris Wailes	54272ac	2021-09-09 16:08:13 -0700	[diff] [blame]	502	fn ident_or_unknown_prefix(&mut self) -> TokenKind {
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	503	debug_assert!(is_id_start(self.prev()));
				504	// Start is already eaten, eat the rest of identifier.
				505	self.eat_while(is_id_continue);
Chris Wailes	54272ac	2021-09-09 16:08:13 -0700	[diff] [blame]	506	// Known prefixes must have been handled earlier. So if
Chris Wailes	bcf972c	2021-10-21 11:03:28 -0700	[diff] [blame]	507	// we see a prefix here, it is definitely an unknown prefix.
Chris Wailes	54272ac	2021-09-09 16:08:13 -0700	[diff] [blame]	508	match self.first() {
				509	'#' \| '"' \| '\'' => UnknownPrefix,
Chris Wailes	356b57e	2022-01-13 10:08:24 -0800	[diff] [blame]	510	c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
				511	self.fake_ident_or_unknown_prefix()
				512	}
Chris Wailes	54272ac	2021-09-09 16:08:13 -0700	[diff] [blame]	513	_ => Ident,
				514	}
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	515	}
				516
Chris Wailes	356b57e	2022-01-13 10:08:24 -0800	[diff] [blame]	517	fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
				518	// Start is already eaten, eat the rest of identifier.
				519	self.eat_while(\|c\| {
				520	unicode_xid::UnicodeXID::is_xid_continue(c)
				521	\|\| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
				522	\|\| c == '\u{200d}'
				523	});
				524	// Known prefixes must have been handled earlier. So if
				525	// we see a prefix here, it is definitely an unknown prefix.
				526	match self.first() {
				527	'#' \| '"' \| '\'' => UnknownPrefix,
				528	_ => InvalidIdent,
				529	}
				530	}
				531
Chris Wailes	cd1aefd	2023-07-13 13:36:21 -0700	[diff] [blame^]	532	fn c_or_byte_string(
				533	&mut self,
				534	mk_kind: impl FnOnce(bool) -> LiteralKind,
				535	mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
				536	single_quoted: Option<fn(bool) -> LiteralKind>,
				537	) -> TokenKind {
				538	match (self.first(), self.second(), single_quoted) {
				539	('\'', _, Some(mk_kind)) => {
				540	self.bump();
				541	let terminated = self.single_quoted_string();
				542	let suffix_start = self.pos_within_token();
				543	if terminated {
				544	self.eat_literal_suffix();
				545	}
				546	let kind = mk_kind(terminated);
				547	Literal { kind, suffix_start }
				548	}
				549	('"', _, _) => {
				550	self.bump();
				551	let terminated = self.double_quoted_string();
				552	let suffix_start = self.pos_within_token();
				553	if terminated {
				554	self.eat_literal_suffix();
				555	}
				556	let kind = mk_kind(terminated);
				557	Literal { kind, suffix_start }
				558	}
				559	('r', '"', _) \| ('r', '#', _) => {
				560	self.bump();
				561	let res = self.raw_double_quoted_string(2);
				562	let suffix_start = self.pos_within_token();
				563	if res.is_ok() {
				564	self.eat_literal_suffix();
				565	}
				566	let kind = mk_kind_raw(res.ok());
				567	Literal { kind, suffix_start }
				568	}
				569	_ => self.ident_or_unknown_prefix(),
				570	}
				571	}
				572
    /// Lexes a numeric literal whose first digit (`first_digit`) has already
    /// been consumed, and returns its kind (`Int` or `Float`) together with
    /// the detected base. The literal suffix is not handled here.
    ///
    /// `empty_int` is set when a base prefix (`0b`, `0o`, `0x`) is not
    /// followed by any digits; `empty_exponent` is set when an `e`/`E` is not
    /// followed by a valid exponent.
    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
            match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
                    // Decimal digits are accepted here, so a literal such as
                    // `0b120` is still consumed as a single token.
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'o' => {
                    base = Base::Octal;
                    self.bump();
                    // Same as for binary: any decimal digit is consumed.
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'x' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    if !self.eat_hexadecimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                // Not a base prefix; consume additional digits.
                '0'..='9' | '_' => {
                    self.eat_decimal_digits();
                }

                // Also not a base prefix; nothing more to do here.
                '.' | 'e' | 'E' => {}

                // Just a 0.
                _ => return Int { base, empty_int: false },
            }
        } else {
            // No base prefix, parse number in the usual way.
            self.eat_decimal_digits();
        };

        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.second() != '.' && !is_id_start(self.second()) => {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
                if self.first().is_digit(10) {
                    self.eat_decimal_digits();
                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
                }
                Float { base, empty_exponent }
            }
            // Exponent directly after the integer part, e.g. `1e3`.
            'e' | 'E' => {
                self.bump();
                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
        }
    }
				645
				646	fn lifetime_or_char(&mut self) -> TokenKind {
				647	debug_assert!(self.prev() == '\'');
				648
				649	let can_be_a_lifetime = if self.second() == '\'' {
				650	// It's surely not a lifetime.
				651	false
				652	} else {
				653	// If the first symbol is valid for identifier, it can be a lifetime.
				654	// Also check if it's a number for a better error reporting (so '0 will
				655	// be reported as invalid lifetime and not as unterminated char literal).
				656	is_id_start(self.first()) \|\| self.first().is_digit(10)
				657	};
				658
				659	if !can_be_a_lifetime {
				660	let terminated = self.single_quoted_string();
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	661	let suffix_start = self.pos_within_token();
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	662	if terminated {
				663	self.eat_literal_suffix();
				664	}
				665	let kind = Char { terminated };
				666	return Literal { kind, suffix_start };
				667	}
				668
				669	// Either a lifetime or a character literal with
				670	// length greater than 1.
				671
				672	let starts_with_number = self.first().is_digit(10);
				673
				674	// Skip the literal contents.
				675	// First symbol can be a number (which isn't a valid identifier start),
				676	// so skip it without any checks.
				677	self.bump();
				678	self.eat_while(is_id_continue);
				679
				680	// Check if after skipping literal contents we've met a closing
				681	// single quote (which means that user attempted to create a
				682	// string with single quotes).
				683	if self.first() == '\'' {
				684	self.bump();
				685	let kind = Char { terminated: true };
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	686	Literal { kind, suffix_start: self.pos_within_token() }
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	687	} else {
				688	Lifetime { starts_with_number }
				689	}
				690	}
				691
    /// Eats the body of a single-quoted literal whose opening `'` has been
    /// consumed, and returns true if a closing `'` was found.
    ///
    /// Scanning gives up (returning false) at a `/`, at a newline that is
    /// not immediately followed by `'`, or at end of input.
    fn single_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '\'');
        // Check if it's a one-symbol literal.
        if self.second() == '\'' && self.first() != '\\' {
            self.bump();
            self.bump();
            return true;
        }

        // Literal has more than one symbol.

        // Parse until either quotes are terminated or error is detected.
        loop {
            match self.first() {
                // Quotes are terminated, finish parsing.
                '\'' => {
                    self.bump();
                    return true;
                }
                // Probably beginning of the comment, which we don't want to include
                // to the error report.
                '/' => break,
                // Newline without following '\'' means unclosed quote, stop parsing.
                '\n' if self.second() != '\'' => break,
                // End of file, stop parsing.
                EOF_CHAR if self.is_eof() => break,
                // Escaped slash is considered one character, so bump twice.
                '\\' => {
                    self.bump();
                    self.bump();
                }
                // Skip the character.
                _ => {
                    self.bump();
                }
            }
        }
        // String was not terminated.
        false
    }
				732
				733	/// Eats double-quoted string and returns true
				734	/// if string is terminated.
				735	fn double_quoted_string(&mut self) -> bool {
				736	debug_assert!(self.prev() == '"');
				737	while let Some(c) = self.bump() {
				738	match c {
				739	'"' => {
				740	return true;
				741	}
				742	'\\' if self.first() == '\\' \|\| self.first() == '"' => {
				743	// Bump again to skip escaped character.
				744	self.bump();
				745	}
				746	_ => (),
				747	}
				748	}
				749	// End of file reached.
				750	false
				751	}
				752
				753	/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	754	fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	755	// Wrap the actual function to handle the error with too many hashes.
				756	// This way, it eats the whole raw string.
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	757	let n_hashes = self.raw_string_unvalidated(prefix_len)?;
Charisee	341341c	2022-05-20 05:14:50 +0000	[diff] [blame]	758	// Only up to 255 `#`s are allowed in raw strings
				759	match u8::try_from(n_hashes) {
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	760	Ok(num) => Ok(num),
				761	Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	762	}
				763	}
				764
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	765	fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	766	debug_assert!(self.prev() == 'r');
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	767	let start_pos = self.pos_within_token();
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	768	let mut possible_terminator_offset = None;
				769	let mut max_hashes = 0;
				770
				771	// Count opening '#' symbols.
Thiébaud Weksteen	5bd94c1	2021-01-06 15:18:42 +0100	[diff] [blame]	772	let mut eaten = 0;
				773	while self.first() == '#' {
				774	eaten += 1;
				775	self.bump();
				776	}
				777	let n_start_hashes = eaten;
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	778
				779	// Check that string is started.
				780	match self.bump() {
				781	Some('"') => (),
				782	c => {
				783	let c = c.unwrap_or(EOF_CHAR);
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	784	return Err(RawStrError::InvalidStarter { bad_char: c });
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	785	}
				786	}
				787
				788	// Skip the string contents and on each '#' character met, check if this is
				789	// a raw string termination.
				790	loop {
				791	self.eat_while(\|c\| c != '"');
				792
				793	if self.is_eof() {
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	794	return Err(RawStrError::NoTerminator {
				795	expected: n_start_hashes,
				796	found: max_hashes,
				797	possible_terminator_offset,
				798	});
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	799	}
				800
				801	// Eat closing double quote.
				802	self.bump();
				803
				804	// Check that amount of closing '#' symbols
				805	// is equal to the amount of opening ones.
				806	// Note that this will not consume extra trailing `#` characters:
				807	// `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
				808	// followed by a `#` token.
Thiébaud Weksteen	5bd94c1	2021-01-06 15:18:42 +0100	[diff] [blame]	809	let mut n_end_hashes = 0;
				810	while self.first() == '#' && n_end_hashes < n_start_hashes {
				811	n_end_hashes += 1;
				812	self.bump();
				813	}
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	814
				815	if n_end_hashes == n_start_hashes {
Charisee	b1d3280	2022-09-22 15:38:41 +0000	[diff] [blame]	816	return Ok(n_start_hashes);
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	817	} else if n_end_hashes > max_hashes {
				818	// Keep track of possible terminators to give a hint about
				819	// where there might be a missing terminator
				820	possible_terminator_offset =
Charisee	f7ad1c4	2023-01-30 22:46:42 +0000	[diff] [blame]	821	Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	822	max_hashes = n_end_hashes;
				823	}
				824	}
				825	}
				826
				827	fn eat_decimal_digits(&mut self) -> bool {
				828	let mut has_digits = false;
				829	loop {
				830	match self.first() {
				831	'_' => {
				832	self.bump();
				833	}
				834	'0'..='9' => {
				835	has_digits = true;
				836	self.bump();
				837	}
				838	_ => break,
				839	}
				840	}
				841	has_digits
				842	}
				843
				844	fn eat_hexadecimal_digits(&mut self) -> bool {
				845	let mut has_digits = false;
				846	loop {
				847	match self.first() {
				848	'_' => {
				849	self.bump();
				850	}
				851	'0'..='9' \| 'a'..='f' \| 'A'..='F' => {
				852	has_digits = true;
				853	self.bump();
				854	}
				855	_ => break,
				856	}
				857	}
				858	has_digits
				859	}
				860
				861	/// Eats the float exponent. Returns true if at least one digit was met,
				862	/// and returns false otherwise.
				863	fn eat_float_exponent(&mut self) -> bool {
				864	debug_assert!(self.prev() == 'e' \|\| self.prev() == 'E');
				865	if self.first() == '-' \|\| self.first() == '+' {
				866	self.bump();
				867	}
				868	self.eat_decimal_digits()
				869	}
				870
Chris Wailes	977026a	2023-02-13 09:13:10 -0800	[diff] [blame]	871	// Eats the suffix of the literal, e.g. "u8".
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	872	fn eat_literal_suffix(&mut self) {
				873	self.eat_identifier();
				874	}
				875
Chris Wailes	977026a	2023-02-13 09:13:10 -0800	[diff] [blame]	876	// Eats the identifier. Note: succeeds on `_`, which isn't a valid
Charisee	d720b3f	2023-03-09 17:35:07 +0000	[diff] [blame]	877	// identifier.
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	878	fn eat_identifier(&mut self) {
				879	if !is_id_start(self.first()) {
				880	return;
				881	}
				882	self.bump();
				883
				884	self.eat_while(is_id_continue);
				885	}
Thiébaud Weksteen	3b664ca	2020-11-26 14:41:59 +0100	[diff] [blame]	886	}