compiler/rustc_lint/src/non_ascii_idents.rs - toolchain/rustc - Git at Google

 use crate::{EarlyContext, EarlyLintPass, LintContext};
 use rustc_ast as ast;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_errors::fluent;
 use rustc_span::symbol::Symbol;

 declare_lint! {
     /// The `non_ascii_idents` lint detects non-ASCII identifiers.
     ///
     /// ### Example
     ///
     /// ```rust,compile_fail
     /// # #![allow(unused)]
     /// #![deny(non_ascii_idents)]
     /// fn main() {
     ///     let föö = 1;
     /// }
     /// ```
     ///
     /// {{produces}}
     ///
     /// ### Explanation
     ///
     /// This lint is allowed by default. Projects that wish to keep their
     /// identifiers restricted to ASCII characters (for example to ease
     /// collaboration or for security reasons) can switch this lint to
     /// "forbid".
     /// See [RFC 2457] for more details.
     ///
     /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
     pub NON_ASCII_IDENTS,
     Allow,
     "detects non-ASCII identifiers",
     crate_level_only
 }

 declare_lint! {
     /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
     /// identifiers.
     ///
     /// ### Example
     ///
     /// ```rust
     /// # #![allow(unused)]
     /// const µ: f64 = 0.000001;
     /// ```
     ///
     /// {{produces}}
     ///
     /// ### Explanation
     ///
     /// This lint warns about using characters which are not commonly used, and may
     /// cause visual confusion.
     ///
     /// This lint is triggered by identifiers that contain a codepoint that is
     /// not part of the set of "Allowed" codepoints as described by [Unicode®
     /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
     /// Security Profile for Identifiers][TR39Allowed].
     ///
     /// Note that the set of uncommon codepoints may change over time. Beware
     /// that if you "forbid" this lint, existing code may start failing in the
     /// future.
     ///
     /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
     pub UNCOMMON_CODEPOINTS,
     Warn,
     "detects uncommon Unicode codepoints in identifiers",
     crate_level_only
 }

 declare_lint! {
     /// The `confusable_idents` lint detects visually confusable pairs between
     /// identifiers.
     ///
     /// ### Example
     ///
     /// ```rust
     /// // Latin Capital Letter E With Caron
     /// pub const Ě: i32 = 1;
     /// // Latin Capital Letter E With Breve
     /// pub const Ĕ: i32 = 2;
     /// ```
     ///
     /// {{produces}}
     ///
     /// ### Explanation
     ///
     /// This lint warns when different identifiers may appear visually similar,
     /// which can cause confusion.
     ///
     /// The confusable detection algorithm is based on [Unicode® Technical
     /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
     /// Detection][TR39Confusable]. For every distinct identifier X execute
     /// the function `skeleton(X)`. If there exist two distinct identifiers X
     /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
     /// The compiler uses the same mechanism to check if an identifier is too
     /// similar to a keyword.
     ///
     /// Note that the set of confusable characters may change over time.
     /// Beware that if you "forbid" this lint, existing code may start failing
     /// in the future.
     ///
     /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
     pub CONFUSABLE_IDENTS,
     Warn,
     "detects visually confusable pairs between identifiers",
     crate_level_only
 }

 declare_lint! {
     /// The `mixed_script_confusables` lint detects visually confusable
     /// characters in identifiers between different [scripts].
     ///
     /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
     ///
     /// ### Example
     ///
     /// ```rust
     /// // The Japanese katakana character エ can be confused with the Han character 工.
     /// const エ: &'static str = "アイウ";
     /// ```
     ///
     /// {{produces}}
     ///
     /// ### Explanation
     ///
     /// This lint warns when characters between different scripts may appear
     /// visually similar, which can cause confusion.
     ///
     /// If the crate contains other identifiers in the same script that have
     /// non-confusable characters, then this lint will *not* be issued. For
     /// example, if the example given above has another identifier with
     /// katakana characters (such as `let カタカナ = 123;`), then this indicates
     /// that you are intentionally using katakana, and it will not warn about
     /// it.
     ///
     /// Note that the set of confusable characters may change over time.
     /// Beware that if you "forbid" this lint, existing code may start failing
     /// in the future.
     pub MIXED_SCRIPT_CONFUSABLES,
     Warn,
     "detects Unicode scripts whose mixed script confusables codepoints are solely used",
     crate_level_only
 }

 // Declares the `NonAsciiIdents` lint pass and associates it with the four
 // identifier lints `NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`,
 // `CONFUSABLE_IDENTS` and `MIXED_SCRIPT_CONFUSABLES`.
 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);

 impl EarlyLintPass for NonAsciiIdents {
     /// Runs the identifier lints of this pass over every symbol recorded in
     /// the parse session's symbol gallery.
     ///
     /// The work is done in three stages, all driven by the same span-sorted
     /// symbol list:
     ///
     /// 1. Every non-ASCII symbol is reported through `NON_ASCII_IDENTS`, and,
     ///    when that lint is not allowed, through `UNCOMMON_CODEPOINTS` if any
     ///    of its characters fails `GeneralSecurityProfile::identifier_allowed`.
     /// 2. If at least one non-ASCII symbol was seen and `CONFUSABLE_IDENTS` is
     ///    not allowed, symbols sharing the same `skeleton` are reported as
     ///    confusable pairs.
     /// 3. If at least one non-ASCII symbol was seen and
     ///    `MIXED_SCRIPT_CONFUSABLES` is not allowed, script sets that are only
     ///    ever used through potentially confusable characters are reported.
     ///
     /// The AST argument is unused; the pass works purely on recorded symbols.
     fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
         use rustc_session::lint::Level;
         use rustc_span::Span;
         use std::collections::BTreeMap;
         use unicode_security::GeneralSecurityProfile;

         // Read the configured level of each lint once up front.
         let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
         let check_uncommon_codepoints =
             cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
         let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
         let check_mixed_script_confusables =
             cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;

         // Nothing to do when all four lints are allowed.
         if !check_non_ascii_idents
             && !check_uncommon_codepoints
             && !check_confusable_idents
             && !check_mixed_script_confusables
         {
             return;
         }

         let mut has_non_ascii_idents = false;
         let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();

         // Sort by `Span` so that error messages make sense with respect to the
         // order of identifier locations in the code.
         let mut symbols: Vec<_> = symbols.iter().collect();
         symbols.sort_by_key(|k| k.1);

         // Stage 1: per-symbol checks. ASCII-only symbols are skipped entirely.
         for (symbol, &sp) in symbols.iter() {
             let symbol_str = symbol.as_str();
             if symbol_str.is_ascii() {
                 continue;
             }
             has_non_ascii_idents = true;
             // Note: this call is not guarded by `check_non_ascii_idents`; that
             // flag is only consulted by the early return above.
             cx.struct_span_lint(
                 NON_ASCII_IDENTS,
                 sp,
                 fluent::lint_identifier_non_ascii_char,
                 |lint| lint,
             );
             // Report the symbol if at least one character is outside the
             // "identifier allowed" set.
             if check_uncommon_codepoints
                 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
             {
                 cx.struct_span_lint(
                     UNCOMMON_CODEPOINTS,
                     sp,
                     fluent::lint_identifier_uncommon_codepoints,
                     |lint| lint,
                 )
             }
         }

         // Stage 2: confusable identifier pairs.
         if has_non_ascii_idents && check_confusable_idents {
             // Maps a skeleton to the representative symbol currently stored for
             // it: (symbol, span, whether that symbol is pure ASCII).
             let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
                 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
             // Scratch buffer reused across iterations to build each skeleton.
             let mut skeleton_buf = String::new();

             for (&symbol, &sp) in symbols.iter() {
                 use unicode_security::confusable_detection::skeleton;

                 let symbol_str = symbol.as_str();
                 let is_ascii = symbol_str.is_ascii();

                 // Get the skeleton as a `Symbol`.
                 skeleton_buf.clear();
                 skeleton_buf.extend(skeleton(&symbol_str));
                 // When the skeleton equals the identifier text, reuse the
                 // existing symbol instead of interning the same string again.
                 let skeleton_sym = if *symbol_str == *skeleton_buf {
                     symbol
                 } else {
                     Symbol::intern(&skeleton_buf)
                 };

                 skeleton_map
                     .entry(skeleton_sym)
                     .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
                         // Two symbols share this skeleton. Report the pair
                         // unless both of them are pure ASCII.
                         if !*existing_is_ascii || !is_ascii {
                             cx.struct_span_lint(
                                 CONFUSABLE_IDENTS,
                                 sp,
                                 fluent::lint_confusable_identifier_pair,
                                 |lint| {
                                     lint.set_arg("existing_sym", *existing_symbol)
                                         .set_arg("sym", symbol)
                                         .span_label(*existing_span, fluent::label)
                                 },
                             );
                         }
                         // Replace a stored ASCII representative with the
                         // non-ASCII symbol. Once the stored entry is non-ASCII,
                         // every later symbol with this skeleton is reported.
                         if *existing_is_ascii && !is_ascii {
                             *existing_symbol = symbol;
                             *existing_span = sp;
                             *existing_is_ascii = is_ascii;
                         }
                     })
                     // First symbol seen for this skeleton: just record it.
                     .or_insert((symbol, sp, is_ascii));
             }
         }

         // Stage 3: mixed-script confusables.
         if has_non_ascii_idents && check_mixed_script_confusables {
             use unicode_security::is_potential_mixed_script_confusable_char;
             use unicode_security::mixed_script::AugmentedScriptSet;

             // State tracked per augmented script set.
             #[derive(Clone)]
             enum ScriptSetUsage {
                 // Only potentially confusable characters have been seen so far;
                 // holds those characters and the span of the symbol that first
                 // introduced the set.
                 Suspicious(Vec<char>, Span),
                 // At least one non-confusable character has been seen.
                 Verified,
             }

             let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
                 FxHashMap::default();
             // The script set of 'A' is pre-seeded as verified, so it is never
             // reported.
             let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
             script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);

             let mut has_suspicous = false;
             for (symbol, &sp) in symbols.iter() {
                 let symbol_str = symbol.as_str();
                 for ch in symbol_str.chars() {
                     if ch.is_ascii() {
                         // all ascii characters are covered by exception.
                         continue;
                     }
                     if !GeneralSecurityProfile::identifier_allowed(ch) {
                         // this character is covered by `uncommon_codepoints` lint.
                         continue;
                     }
                     let augmented_script_set = AugmentedScriptSet::for_char(ch);
                     script_states
                         .entry(augmented_script_set)
                         .and_modify(|existing_state| {
                             // A verified set stays verified. A suspicious set
                             // either collects another confusable character or is
                             // promoted by the first non-confusable one.
                             if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
                                 if is_potential_mixed_script_confusable_char(ch) {
                                     ch_list.push(ch);
                                 } else {
                                     *existing_state = ScriptSetUsage::Verified;
                                 }
                             }
                         })
                         .or_insert_with(|| {
                             // First character seen for this script set.
                             if !is_potential_mixed_script_confusable_char(ch) {
                                 ScriptSetUsage::Verified
                             } else {
                                 has_suspicous = true;
                                 ScriptSetUsage::Suspicious(vec![ch], sp)
                             }
                         });
                 }
             }

             // `has_suspicous` records that a suspicious entry was created at
             // some point; it may have been promoted to verified since.
             if has_suspicous {
                 // Snapshot of the keys whose state is `Verified`.
                 let verified_augmented_script_sets = script_states
                     .iter()
                     .flat_map(|(k, v)| match v {
                         ScriptSetUsage::Verified => Some(*k),
                         _ => None,
                     })
                     .collect::<Vec<_>>();

                 // we're sorting the output here.
                 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
                     BTreeMap::new();

                 'outerloop: for (augment_script_set, usage) in script_states {
                     // Only entries that are still suspicious can be reported.
                     let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };

                     // Sets for which `is_all()` holds are never reported.
                     if augment_script_set.is_all() {
                         continue;
                     }

                     // Skip this suspicious set if its intersection with any
                     // verified set (other than an `is_all()` one) is neither
                     // empty nor `is_all()`.
                     for existing in verified_augmented_script_sets.iter() {
                         if existing.is_all() {
                             continue;
                         }
                         let mut intersect = *existing;
                         intersect.intersect_with(augment_script_set);
                         if !intersect.is_empty() && !intersect.is_all() {
                             continue 'outerloop;
                         }
                     }

                     // We sort primitive chars here and can use unstable sort
                     ch_list.sort_unstable();
                     ch_list.dedup();
                     lint_reports.insert((sp, ch_list), augment_script_set);
                 }

                 // Emit in `BTreeMap` key order, i.e. ordered by (span, chars).
                 for ((sp, ch_list), script_set) in lint_reports {
                     cx.struct_span_lint(
                         MIXED_SCRIPT_CONFUSABLES,
                         sp,
                         fluent::lint_mixed_script_confusables,
                         |lint| {
                             // Render the characters as a comma-separated list
                             // of `'c' (U+XXXX)` entries.
                             let mut includes = String::new();
                             for (idx, ch) in ch_list.into_iter().enumerate() {
                                 if idx != 0 {
                                     includes += ", ";
                                 }
                                 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
                                 includes += &char_info;
                             }
                             lint.set_arg("set", script_set.to_string())
                                 .set_arg("includes", includes)
                                 .note(fluent::includes_note)
                                 .note(fluent::note)
                         },
                     );
                 }
             }
         }
     }
 }
	use crate::{EarlyContext, EarlyLintPass, LintContext};
	use rustc_ast as ast;
	use rustc_data_structures::fx::FxHashMap;
	use rustc_errors::fluent;
	use rustc_span::symbol::Symbol;

	declare_lint! {
	/// The `non_ascii_idents` lint detects non-ASCII identifiers.
	///
	/// ### Example
	///
	/// ```rust,compile_fail
	/// # #![allow(unused)]
	/// #![deny(non_ascii_idents)]
	/// fn main() {
	///     let föö = 1;
	/// }
	/// ```
	///
	/// {{produces}}
	///
	/// ### Explanation
	///
	/// This lint is allowed by default. Projects that wish to keep their
	/// identifiers restricted to ASCII characters (for example to ease
	/// collaboration or for security reasons) can switch this lint to
	/// "forbid".
	/// See [RFC 2457] for more details.
	///
	/// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
	pub NON_ASCII_IDENTS,
	Allow,
	"detects non-ASCII identifiers",
	crate_level_only
	}

	declare_lint! {
	/// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
	/// identifiers.
	///
	/// ### Example
	///
	/// ```rust
	/// # #![allow(unused)]
	/// const µ: f64 = 0.000001;
	/// ```
	///
	/// {{produces}}
	///
	/// ### Explanation
	///
	/// This lint warns about using characters which are not commonly used, and may
	/// cause visual confusion.
	///
	/// This lint is triggered by identifiers that contain a codepoint that is
	/// not part of the set of "Allowed" codepoints as described by [Unicode®
	/// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
	/// Security Profile for Identifiers][TR39Allowed].
	///
	/// Note that the set of uncommon codepoints may change over time. Beware
	/// that if you "forbid" this lint, existing code may start failing in the
	/// future.
	///
	/// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
	pub UNCOMMON_CODEPOINTS,
	Warn,
	"detects uncommon Unicode codepoints in identifiers",
	crate_level_only
	}

	declare_lint! {
	/// The `confusable_idents` lint detects visually confusable pairs between
	/// identifiers.
	///
	/// ### Example
	///
	/// ```rust
	/// // Latin Capital Letter E With Caron
	/// pub const Ě: i32 = 1;
	/// // Latin Capital Letter E With Breve
	/// pub const Ĕ: i32 = 2;
	/// ```
	///
	/// {{produces}}
	///
	/// ### Explanation
	///
	/// This lint warns when different identifiers may appear visually similar,
	/// which can cause confusion.
	///
	/// The confusable detection algorithm is based on [Unicode® Technical
	/// Standard #39 Unicode Security Mechanisms Section 4 Confusable
	/// Detection][TR39Confusable]. For every distinct identifier X execute
	/// the function `skeleton(X)`. If there exist two distinct identifiers X
	/// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
	/// The compiler uses the same mechanism to check if an identifier is too
	/// similar to a keyword.
	///
	/// Note that the set of confusable characters may change over time.
	/// Beware that if you "forbid" this lint, existing code may start failing
	/// in the future.
	///
	/// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
	pub CONFUSABLE_IDENTS,
	Warn,
	"detects visually confusable pairs between identifiers",
	crate_level_only
	}

	declare_lint! {
	/// The `mixed_script_confusables` lint detects visually confusable
	/// characters in identifiers between different [scripts].
	///
	/// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
	///
	/// ### Example
	///
	/// ```rust
	/// // The Japanese katakana character エ can be confused with the Han character 工.
	/// const エ: &'static str = "アイウ";
	/// ```
	///
	/// {{produces}}
	///
	/// ### Explanation
	///
	/// This lint warns when characters between different scripts may appear
	/// visually similar, which can cause confusion.
	///
	/// If the crate contains other identifiers in the same script that have
	/// non-confusable characters, then this lint will *not* be issued. For
	/// example, if the example given above has another identifier with
	/// katakana characters (such as `let カタカナ = 123;`), then this indicates
	/// that you are intentionally using katakana, and it will not warn about
	/// it.
	///
	/// Note that the set of confusable characters may change over time.
	/// Beware that if you "forbid" this lint, existing code may start failing
	/// in the future.
	pub MIXED_SCRIPT_CONFUSABLES,
	Warn,
	"detects Unicode scripts whose mixed script confusables codepoints are solely used",
	crate_level_only
	}

	// Declares the `NonAsciiIdents` lint pass and associates it with the four
	// identifier lints `NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`,
	// `CONFUSABLE_IDENTS` and `MIXED_SCRIPT_CONFUSABLES`.
	declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);

	impl EarlyLintPass for NonAsciiIdents {
	fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
	use rustc_session::lint::Level;
	use rustc_span::Span;
	use std::collections::BTreeMap;
	use unicode_security::GeneralSecurityProfile;

	let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
	let check_uncommon_codepoints =
	cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
	let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
	let check_mixed_script_confusables =
	cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;

	if !check_non_ascii_idents
	&& !check_uncommon_codepoints
	&& !check_confusable_idents
	&& !check_mixed_script_confusables
	{
	return;
	}

	let mut has_non_ascii_idents = false;
	let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();

	// Sort by `Span` so that error messages make sense with respect to the
	// order of identifier locations in the code.
	let mut symbols: Vec<_> = symbols.iter().collect();
	symbols.sort_by_key(\|k\| k.1);

	for (symbol, &sp) in symbols.iter() {
	let symbol_str = symbol.as_str();
	if symbol_str.is_ascii() {
	continue;
	}
	has_non_ascii_idents = true;
	cx.struct_span_lint(
	NON_ASCII_IDENTS,
	sp,
	fluent::lint_identifier_non_ascii_char,
	\|lint\| lint,
	);
	if check_uncommon_codepoints
	&& !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
	{
	cx.struct_span_lint(
	UNCOMMON_CODEPOINTS,
	sp,
	fluent::lint_identifier_uncommon_codepoints,
	\|lint\| lint,
	)
	}
	}

	if has_non_ascii_idents && check_confusable_idents {
	let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
	FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
	let mut skeleton_buf = String::new();

	for (&symbol, &sp) in symbols.iter() {
	use unicode_security::confusable_detection::skeleton;

	let symbol_str = symbol.as_str();
	let is_ascii = symbol_str.is_ascii();

	// Get the skeleton as a `Symbol`.
	skeleton_buf.clear();
	skeleton_buf.extend(skeleton(&symbol_str));
	let skeleton_sym = if symbol_str == skeleton_buf {
	symbol
	} else {
	Symbol::intern(&skeleton_buf)
	};

	skeleton_map
	.entry(skeleton_sym)
	.and_modify(\|(existing_symbol, existing_span, existing_is_ascii)\| {
	if !*existing_is_ascii \|\| !is_ascii {
	cx.struct_span_lint(
	CONFUSABLE_IDENTS,
	sp,
	fluent::lint_confusable_identifier_pair,
	\|lint\| {
	lint.set_arg("existing_sym", *existing_symbol)
	.set_arg("sym", symbol)
	.span_label(*existing_span, fluent::label)
	},
	);
	}
	if *existing_is_ascii && !is_ascii {
	*existing_symbol = symbol;
	*existing_span = sp;
	*existing_is_ascii = is_ascii;
	}
	})
	.or_insert((symbol, sp, is_ascii));
	}
	}

	if has_non_ascii_idents && check_mixed_script_confusables {
	use unicode_security::is_potential_mixed_script_confusable_char;
	use unicode_security::mixed_script::AugmentedScriptSet;

	#[derive(Clone)]
	enum ScriptSetUsage {
	Suspicious(Vec<char>, Span),
	Verified,
	}

	let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
	FxHashMap::default();
	let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
	script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);

	let mut has_suspicous = false;
	for (symbol, &sp) in symbols.iter() {
	let symbol_str = symbol.as_str();
	for ch in symbol_str.chars() {
	if ch.is_ascii() {
	// all ascii characters are covered by exception.
	continue;
	}
	if !GeneralSecurityProfile::identifier_allowed(ch) {
	// this character is covered by `uncommon_codepoints` lint.
	continue;
	}
	let augmented_script_set = AugmentedScriptSet::for_char(ch);
	script_states
	.entry(augmented_script_set)
	.and_modify(\|existing_state\| {
	if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
	if is_potential_mixed_script_confusable_char(ch) {
	ch_list.push(ch);
	} else {
	*existing_state = ScriptSetUsage::Verified;
	}
	}
	})
	.or_insert_with(\|\| {
	if !is_potential_mixed_script_confusable_char(ch) {
	ScriptSetUsage::Verified
	} else {
	has_suspicous = true;
	ScriptSetUsage::Suspicious(vec![ch], sp)
	}
	});
	}
	}

	if has_suspicous {
	let verified_augmented_script_sets = script_states
	.iter()
	.flat_map(\|(k, v)\| match v {
	ScriptSetUsage::Verified => Some(*k),
	_ => None,
	})
	.collect::<Vec<_>>();

	// we're sorting the output here.
	let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
	BTreeMap::new();

	'outerloop: for (augment_script_set, usage) in script_states {
	let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };

	if augment_script_set.is_all() {
	continue;
	}

	for existing in verified_augmented_script_sets.iter() {
	if existing.is_all() {
	continue;
	}
	let mut intersect = *existing;
	intersect.intersect_with(augment_script_set);
	if !intersect.is_empty() && !intersect.is_all() {
	continue 'outerloop;
	}
	}

	// We sort primitive chars here and can use unstable sort
	ch_list.sort_unstable();
	ch_list.dedup();
	lint_reports.insert((sp, ch_list), augment_script_set);
	}

	for ((sp, ch_list), script_set) in lint_reports {
	cx.struct_span_lint(
	MIXED_SCRIPT_CONFUSABLES,
	sp,
	fluent::lint_mixed_script_confusables,
	\|lint\| {
	let mut includes = String::new();
	for (idx, ch) in ch_list.into_iter().enumerate() {
	if idx != 0 {
	includes += ", ";
	}
	let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
	includes += &char_info;
	}
	lint.set_arg("set", script_set.to_string())
	.set_arg("includes", includes)
	.note(fluent::includes_note)
	.note(fluent::note)
	},
	);
	}
	}
	}
	}
	}