Rollup merge of #72770 - crlf0710:mixed_script_confusable, r=Manishearth

Implement mixed script confusable lint.

This implements the mixed script confusable lint defined in RFC 2457.
This is blocked on #72069 and https://github.com/unicode-rs/unicode-security/pull/13, and will need a Cargo.toml version bump after those are resolved.

The lint warning message is sub-optimal for now. We'll need a mechanism to properly print an `AugmentedScriptSet` to the screen; this is to be added in the `unicode-security` crate.

r? @Manishearth
This commit is contained in:
Manish Goregaokar 2020-06-25 18:00:05 -07:00 committed by GitHub
commit 23c9ac6b73
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 344 additions and 176 deletions

View File

@ -5405,15 +5405,15 @@ dependencies = [
[[package]]
name = "unicode-script"
version = "0.4.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b2c5c29e805da6817f5af6a627d65adb045cebf05cccd5a3493d6109454391c"
checksum = "58b33414ea8db4b7ea0343548dbdc31d27aef06beacf7044a87e564d9b0feb7d"
[[package]]
name = "unicode-security"
version = "0.0.3"
version = "0.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5f9011bbed9c13372bc8df618b55a38138445199caf3b61d432c6859c36dee0"
checksum = "5d87c28edc5b263377e448d6cdcb935c06b95413d8013ba6fae470558ccab18f"
dependencies = [
"unicode-normalization",
"unicode-script",

View File

@ -10,7 +10,7 @@ path = "lib.rs"
[dependencies]
log = "0.4"
unicode-security = "0.0.3"
unicode-security = "0.0.5"
rustc_middle = { path = "../librustc_middle" }
rustc_ast_pretty = { path = "../librustc_ast_pretty" }
rustc_attr = { path = "../librustc_attr" }

View File

@ -1,9 +1,7 @@
use crate::{EarlyContext, EarlyLintPass, LintContext};
use rustc_ast::ast;
use rustc_data_structures::fx::FxHashMap;
use rustc_span::symbol::{Ident, SymbolStr};
use std::hash::{Hash, Hasher};
use std::ops::Deref;
use rustc_span::symbol::SymbolStr;
declare_lint! {
pub NON_ASCII_IDENTS,
@ -19,158 +17,256 @@ declare_lint! {
crate_level_only
}
// FIXME: Change this to warn.
declare_lint! {
pub CONFUSABLE_IDENTS,
Allow,
Warn,
"detects visually confusable pairs between identifiers",
crate_level_only
}
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS]);
enum CowBoxSymStr {
Interned(SymbolStr),
Owned(Box<str>),
declare_lint! {
pub MIXED_SCRIPT_CONFUSABLES,
Warn,
"detects Unicode scripts whose mixed script confusables codepoints are solely used",
crate_level_only
}
/// Lets a `CowBoxSymStr` be used wherever a `&str` is expected, regardless of
/// which variant currently holds the text.
impl Deref for CowBoxSymStr {
    type Target = str;

    fn deref(&self) -> &str {
        match *self {
            // Borrow the text out of the interned symbol string.
            CowBoxSymStr::Interned(ref sym) => &**sym,
            // Borrow the text out of the boxed string.
            CowBoxSymStr::Owned(ref boxed) => &**boxed,
        }
    }
}
/// Hashes the underlying string contents, so both variants hash alike when
/// they hold the same text.
impl Hash for CowBoxSymStr {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Deref-coerce to `str` first so the variant never affects the hash.
        let text: &str = self;
        text.hash(state)
    }
}
/// Two values are equal when their underlying string contents are equal,
/// independent of which variant stores them.
impl PartialEq<CowBoxSymStr> for CowBoxSymStr {
    #[inline]
    fn eq(&self, other: &CowBoxSymStr) -> bool {
        let lhs: &str = self;
        let rhs: &str = other;
        lhs == rhs
    }
}
// Marker impl: equality is delegated to `str` comparison by the `PartialEq`
// impl, which is a full equivalence relation, so `Eq` can be asserted.
impl Eq for CowBoxSymStr {}
/// Computes the confusable-detection skeleton of `symbol_str`, using `buffer`
/// as scratch space.
///
/// The skeleton comes from `unicode_security::confusable_detection::skeleton`.
/// When it is identical to the symbol text, the symbol itself is returned as
/// `CowBoxSymStr::Interned`. Otherwise the buffer's contents are moved out into
/// a `CowBoxSymStr::Owned` boxed string, leaving `buffer` as an empty `String`.
fn calc_skeleton(symbol_str: SymbolStr, buffer: &'_ mut String) -> CowBoxSymStr {
    use unicode_security::confusable_detection::skeleton;

    // Discard whatever a previous call left behind before filling the buffer.
    buffer.clear();
    buffer.extend(skeleton(&symbol_str));

    if symbol_str != *buffer {
        // Take ownership of the computed skeleton; the caller keeps an empty buffer.
        let owned = std::mem::replace(buffer, String::new());
        return CowBoxSymStr::Owned(owned.into_boxed_str());
    }
    CowBoxSymStr::Interned(symbol_str)
}
/// Returns `true` when `c` falls inside one of the inclusive code point ranges
/// of the hard-coded ASCII confusable closure table.
fn is_in_ascii_confusable_closure(c: char) -> bool {
    // FIXME: move this table to `unicode_security` crate.
    // data here corresponds to Unicode 13.
    const ASCII_CONFUSABLE_CLOSURE: &[(u64, u64)] = &[(0x00, 0x7f), (0xba, 0xba), (0x2080, 0x2080)];

    let code_point = c as u64;
    // Both bounds of every table entry are inclusive.
    ASCII_CONFUSABLE_CLOSURE
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last)
}
/// Returns `true` when `c` is one of the individual code points listed in the
/// hard-coded "relevant" subset of the ASCII confusable closure.
fn is_in_ascii_confusable_closure_relevant_list(c: char) -> bool {
    // FIXME: move this table to `unicode_security` crate.
    // data here corresponds to Unicode 13.
    const ASCII_CONFUSABLE_CLOSURE_RELEVANT_LIST: &[u64] = &[
        0x22, 0x25, 0x27, 0x2f, 0x30, 0x31, 0x49, 0x4f, 0x60, 0x6c, 0x6d, 0x6e, 0x72, 0x7c, 0xba,
        0x2080,
    ];

    let code_point = c as u64;
    ASCII_CONFUSABLE_CLOSURE_RELEVANT_LIST.contains(&code_point)
}
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
impl EarlyLintPass for NonAsciiIdents {
fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
use rustc_session::lint::Level;
if cx.builder.lint_level(CONFUSABLE_IDENTS).0 == Level::Allow {
return;
}
let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
let mut symbol_strs_and_spans = Vec::with_capacity(symbols.len());
let mut in_fast_path = true;
for (symbol, sp) in symbols.iter() {
// fast path
let symbol_str = symbol.as_str();
if !symbol_str.chars().all(is_in_ascii_confusable_closure) {
// fallback to slow path.
symbol_strs_and_spans.clear();
in_fast_path = false;
break;
}
if symbol_str.chars().any(is_in_ascii_confusable_closure_relevant_list) {
symbol_strs_and_spans.push((symbol_str, *sp));
}
}
if !in_fast_path {
// slow path
for (symbol, sp) in symbols.iter() {
let symbol_str = symbol.as_str();
symbol_strs_and_spans.push((symbol_str, *sp));
}
}
drop(symbols);
symbol_strs_and_spans.sort_by_key(|x| x.0.clone());
let mut skeleton_map =
FxHashMap::with_capacity_and_hasher(symbol_strs_and_spans.len(), Default::default());
let mut str_buf = String::new();
for (symbol_str, sp) in symbol_strs_and_spans {
let skeleton = calc_skeleton(symbol_str.clone(), &mut str_buf);
skeleton_map
.entry(skeleton)
.and_modify(|(existing_symbolstr, existing_span)| {
cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
lint.build(&format!(
"identifier pair considered confusable between `{}` and `{}`",
existing_symbolstr, symbol_str
))
.span_label(
*existing_span,
"this is where the previous identifier occurred",
)
.emit();
});
})
.or_insert((symbol_str, sp));
}
}
fn check_ident(&mut self, cx: &EarlyContext<'_>, ident: Ident) {
use rustc_span::Span;
use std::collections::BTreeMap;
use unicode_security::GeneralSecurityProfile;
let name_str = ident.name.as_str();
if name_str.is_ascii() {
use utils::CowBoxSymStr;
let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
let check_uncommon_codepoints =
cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
let check_mixed_script_confusables =
cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
if !check_non_ascii_idents
&& !check_uncommon_codepoints
&& !check_confusable_idents
&& !check_mixed_script_confusables
{
return;
}
cx.struct_span_lint(NON_ASCII_IDENTS, ident.span, |lint| {
lint.build("identifier contains non-ASCII characters").emit()
});
if !name_str.chars().all(GeneralSecurityProfile::identifier_allowed) {
cx.struct_span_lint(UNCOMMON_CODEPOINTS, ident.span, |lint| {
lint.build("identifier contains uncommon Unicode codepoints").emit()
})
let mut has_non_ascii_idents = false;
let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
for (symbol, &sp) in symbols.iter() {
let symbol_str = symbol.as_str();
if symbol_str.is_ascii() {
continue;
}
has_non_ascii_idents = true;
cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
lint.build("identifier contains non-ASCII characters").emit()
});
if check_uncommon_codepoints
&& !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
{
cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
lint.build("identifier contains uncommon Unicode codepoints").emit()
})
}
}
if has_non_ascii_idents && check_confusable_idents {
let mut skeleton_map: FxHashMap<CowBoxSymStr, (SymbolStr, Span, bool)> =
FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
let mut str_buf = String::new();
for (symbol, &sp) in symbols.iter() {
fn calc_skeleton(symbol_str: &SymbolStr, buffer: &mut String) -> CowBoxSymStr {
use std::mem::replace;
use unicode_security::confusable_detection::skeleton;
buffer.clear();
buffer.extend(skeleton(symbol_str));
if *symbol_str == *buffer {
CowBoxSymStr::Interned(symbol_str.clone())
} else {
let owned = replace(buffer, String::new());
CowBoxSymStr::Owned(owned.into_boxed_str())
}
}
let symbol_str = symbol.as_str();
let is_ascii = symbol_str.is_ascii();
let skeleton = calc_skeleton(&symbol_str, &mut str_buf);
skeleton_map
.entry(skeleton)
.and_modify(|(existing_symbolstr, existing_span, existing_is_ascii)| {
if !*existing_is_ascii || !is_ascii {
cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
lint.build(&format!(
"identifier pair considered confusable between `{}` and `{}`",
existing_symbolstr, symbol_str
))
.span_label(
*existing_span,
"this is where the previous identifier occurred",
)
.emit();
});
}
if *existing_is_ascii && !is_ascii {
*existing_symbolstr = symbol_str.clone();
*existing_span = sp;
*existing_is_ascii = is_ascii;
}
})
.or_insert((symbol_str, sp, is_ascii));
}
}
if has_non_ascii_idents && check_mixed_script_confusables {
use unicode_security::is_potential_mixed_script_confusable_char;
use unicode_security::mixed_script::AugmentedScriptSet;
#[derive(Clone)]
enum ScriptSetUsage {
Suspicious(Vec<char>, Span),
Verified,
}
let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
FxHashMap::default();
let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
let mut has_suspicous = false;
for (symbol, &sp) in symbols.iter() {
let symbol_str = symbol.as_str();
for ch in symbol_str.chars() {
if ch.is_ascii() {
// all ascii characters are covered by exception.
continue;
}
if !GeneralSecurityProfile::identifier_allowed(ch) {
// this character is covered by `uncommon_codepoints` lint.
continue;
}
let augmented_script_set = AugmentedScriptSet::for_char(ch);
script_states
.entry(augmented_script_set)
.and_modify(|existing_state| {
if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
if is_potential_mixed_script_confusable_char(ch) {
ch_list.push(ch);
} else {
*existing_state = ScriptSetUsage::Verified;
}
}
})
.or_insert_with(|| {
if !is_potential_mixed_script_confusable_char(ch) {
ScriptSetUsage::Verified
} else {
has_suspicous = true;
ScriptSetUsage::Suspicious(vec![ch], sp)
}
});
}
}
if has_suspicous {
let verified_augmented_script_sets = script_states
.iter()
.flat_map(|(k, v)| match v {
ScriptSetUsage::Verified => Some(*k),
_ => None,
})
.collect::<Vec<_>>();
// we're sorting the output here.
let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
BTreeMap::new();
'outerloop: for (augment_script_set, usage) in script_states {
let (mut ch_list, sp) = match usage {
ScriptSetUsage::Verified => continue,
ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
};
if augment_script_set.is_all() {
continue;
}
for existing in verified_augmented_script_sets.iter() {
if existing.is_all() {
continue;
}
let mut intersect = *existing;
intersect.intersect_with(augment_script_set);
if !intersect.is_empty() && !intersect.is_all() {
continue 'outerloop;
}
}
ch_list.sort();
ch_list.dedup();
lint_reports.insert((sp, ch_list), augment_script_set);
}
for ((sp, ch_list), script_set) in lint_reports {
cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
let message = format!(
"The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
script_set);
let mut note = "The usage includes ".to_string();
for (idx, ch) in ch_list.into_iter().enumerate() {
if idx != 0 {
note += ", ";
}
let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
note += &char_info;
}
note += ".";
lint.build(&message).note(&note).note("Please recheck to make sure their usages are indeed what you want.").emit()
});
}
}
}
}
}
/// Helper types used by the identifier lints in the parent module.
mod utils {
    use rustc_span::symbol::SymbolStr;
    use std::hash::{Hash, Hasher};
    use std::ops::Deref;

    /// A string that is either an interned symbol string or a separately
    /// owned boxed string. Dereferencing, hashing and equality all operate
    /// on the underlying `str`, so the variant never affects comparisons.
    pub(super) enum CowBoxSymStr {
        Interned(SymbolStr),
        Owned(Box<str>),
    }

    impl Deref for CowBoxSymStr {
        type Target = str;

        fn deref(&self) -> &str {
            match *self {
                CowBoxSymStr::Interned(ref sym) => &**sym,
                CowBoxSymStr::Owned(ref boxed) => &**boxed,
            }
        }
    }

    impl Hash for CowBoxSymStr {
        #[inline]
        fn hash<H: Hasher>(&self, state: &mut H) {
            // Hash the text itself, never the variant.
            let text: &str = self;
            text.hash(state)
        }
    }

    impl PartialEq<CowBoxSymStr> for CowBoxSymStr {
        #[inline]
        fn eq(&self, other: &CowBoxSymStr) -> bool {
            let lhs: &str = self;
            let rhs: &str = other;
            lhs == rhs
        }
    }

    impl Eq for CowBoxSymStr {}
}

View File

@ -13,6 +13,7 @@ use rustc_span::hygiene::ExpnId;
use rustc_span::source_map::{FilePathMapping, SourceMap};
use rustc_span::{MultiSpan, Span, Symbol};
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::str;
@ -63,7 +64,7 @@ impl GatedSpans {
#[derive(Default)]
pub struct SymbolGallery {
/// All symbols that occurred, each with the span of its first occurrence.
pub symbols: Lock<FxHashMap<Symbol, Span>>,
pub symbols: Lock<BTreeMap<Symbol, Span>>,
}
impl SymbolGallery {

View File

@ -2,8 +2,14 @@
#![deny(confusable_idents)]
#![allow(uncommon_codepoints, non_upper_case_globals)]
const : usize = 42; //~ ERROR identifier pair considered confusable
const : usize = 42;
fn main() {
let s = "rust";
let s = "rust"; //~ ERROR identifier pair considered confusable
not_affected();
}
fn not_affected() {
let s1 = 1;
let sl = 'l';
}

View File

@ -1,11 +1,11 @@
error: identifier pair considered confusable between `s` and ``
--> $DIR/lint-confusable-idents.rs:5:7
error: identifier pair considered confusable between `` and `s`
--> $DIR/lint-confusable-idents.rs:8:9
|
LL | const : usize = 42;
| ^^
| -- this is where the previous identifier occurred
...
LL | let s = "rust";
| - this is where the previous identifier occurred
| ^
|
note: the lint level is defined here
--> $DIR/lint-confusable-idents.rs:2:9

View File

@ -0,0 +1,20 @@
// check-pass
#![feature(non_ascii_idents)]
#![deny(mixed_script_confusables)]
struct ΑctuallyNotLatin;
fn main() {
let λ = 42; // this usage of Greek confirms that Greek is used intentionally.
}
mod роре {
const : &'static str = "アイウ";
// this usage of Katakana confirms that Katakana is used intentionally.
fn () {
let д: usize = 100; // this usage of Cyrillic confirms that Cyrillic is used intentionally.
println!("meow!");
}
}

View File

@ -0,0 +1,15 @@
#![feature(non_ascii_idents)]
#![deny(mixed_script_confusables)]
struct ΑctuallyNotLatin;
//~^ ERROR The usage of Script Group `Greek` in this crate consists solely of
fn main() {
let v = ΑctuallyNotLatin;
}
mod роре {
//~^ ERROR The usage of Script Group `Cyrillic` in this crate consists solely of
const : &'static str = "アイウ";
//~^ ERROR The usage of Script Group `Japanese, Katakana` in this crate consists solely of
}

View File

@ -0,0 +1,34 @@
error: The usage of Script Group `Greek` in this crate consists solely of mixed script confusables
--> $DIR/lint-mixed-script-confusables.rs:4:8
|
LL | struct ΑctuallyNotLatin;
| ^^^^^^^^^^^^^^^^
|
note: the lint level is defined here
--> $DIR/lint-mixed-script-confusables.rs:2:9
|
LL | #![deny(mixed_script_confusables)]
| ^^^^^^^^^^^^^^^^^^^^^^^^
= note: The usage includes 'Α' (U+0391).
= note: Please recheck to make sure their usages are indeed what you want.
error: The usage of Script Group `Cyrillic` in this crate consists solely of mixed script confusables
--> $DIR/lint-mixed-script-confusables.rs:11:5
|
LL | mod роре {
| ^^^^
|
= note: The usage includes 'е' (U+0435), 'о' (U+043E), 'р' (U+0440).
= note: Please recheck to make sure their usages are indeed what you want.
error: The usage of Script Group `Japanese, Katakana` in this crate consists solely of mixed script confusables
--> $DIR/lint-mixed-script-confusables.rs:13:11
|
LL | const エ: &'static str = "アイウ";
| ^^
|
= note: The usage includes 'エ' (U+30A8).
= note: Please recheck to make sure their usages are indeed what you want.
error: aborting due to 3 previous errors

View File

@ -7,5 +7,7 @@ fn coöperation() {} //~ ERROR identifier contains non-ASCII characters
fn main() {
let naïveté = 2; //~ ERROR identifier contains non-ASCII characters
println!("{}", naïveté); //~ ERROR identifier contains non-ASCII characters
// using the same identifier the second time won't trigger the lint.
println!("{}", naïveté);
}

View File

@ -22,11 +22,5 @@ error: identifier contains non-ASCII characters
LL | let naïveté = 2;
| ^^^^^^^
error: identifier contains non-ASCII characters
--> $DIR/lint-non-ascii-idents.rs:10:20
|
LL | println!("{}", naïveté);
| ^^^^^^^
error: aborting due to 4 previous errors
error: aborting due to 3 previous errors

View File

@ -7,5 +7,7 @@ fn dijkstra() {} //~ ERROR identifier contains uncommon Unicode codepoints
fn main() {
let = "rust"; //~ ERROR identifier contains uncommon Unicode codepoints
println!("{}", ); //~ ERROR identifier contains uncommon Unicode codepoints
// using the same identifier the second time won't trigger the lint.
println!("{}", );
}

View File

@ -22,11 +22,5 @@ error: identifier contains uncommon Unicode codepoints
LL | let ㇻㇲㇳ = "rust";
| ^^^^^^
error: identifier contains uncommon Unicode codepoints
--> $DIR/lint-uncommon-codepoints.rs:10:20
|
LL | println!("{}", ㇻㇲㇳ);
| ^^^^^^
error: aborting due to 4 previous errors
error: aborting due to 3 previous errors

View File

@ -1,4 +1,6 @@
// ignore-tidy-trailing-newlines
// error-pattern: aborting due to 3 previous errors
#![allow(uncommon_codepoints)]
y![
Ϥ,

View File

@ -1,5 +1,5 @@
error: this file contains an unclosed delimiter
--> $DIR/issue-62524.rs:4:3
--> $DIR/issue-62524.rs:6:3
|
LL | y![
| - unclosed delimiter
@ -7,7 +7,7 @@ LL | Ϥ,
| ^
error: macros that expand to items must be delimited with braces or followed by a semicolon
--> $DIR/issue-62524.rs:3:3
--> $DIR/issue-62524.rs:5:3
|
LL | y![
| ___^
@ -24,7 +24,7 @@ LL | Ϥ,;
| ^
error: cannot find macro `y` in this scope
--> $DIR/issue-62524.rs:3:1
--> $DIR/issue-62524.rs:5:1
|
LL | y![
| ^

View File

@ -1,3 +1,5 @@
#![allow(mixed_script_confusables)]
fn foo<
'β, //~ ERROR non-ascii idents are not fully supported
γ //~ ERROR non-ascii idents are not fully supported

View File

@ -1,5 +1,5 @@
error[E0658]: non-ascii idents are not fully supported
--> $DIR/utf8_idents.rs:2:5
--> $DIR/utf8_idents.rs:4:5
|
LL | 'β,
| ^^
@ -8,7 +8,7 @@ LL | 'β,
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
error[E0658]: non-ascii idents are not fully supported
--> $DIR/utf8_idents.rs:3:5
--> $DIR/utf8_idents.rs:5:5
|
LL | γ
| ^
@ -17,7 +17,7 @@ LL | γ
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
error[E0658]: non-ascii idents are not fully supported
--> $DIR/utf8_idents.rs:8:5
--> $DIR/utf8_idents.rs:10:5
|
LL | δ: usize
| ^
@ -26,7 +26,7 @@ LL | δ: usize
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
error[E0658]: non-ascii idents are not fully supported
--> $DIR/utf8_idents.rs:12:9
--> $DIR/utf8_idents.rs:14:9
|
LL | let α = 0.00001f64;
| ^
@ -35,7 +35,7 @@ LL | let α = 0.00001f64;
= help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable
warning: type parameter `γ` should have an upper camel case name
--> $DIR/utf8_idents.rs:3:5
--> $DIR/utf8_idents.rs:5:5
|
LL | γ
| ^ help: convert the identifier to upper camel case: `Γ`