From 0bcddfe23a4b96d970f953b99a2e4f28ece3c0d6 Mon Sep 17 00:00:00 2001 From: Charles Lew Date: Sat, 23 Nov 2019 22:33:40 +0800 Subject: [PATCH 1/3] Normalize identifiers in librustc_parse. --- Cargo.lock | 8 ++++++-- src/librustc_parse/Cargo.toml | 1 + src/librustc_parse/lexer/mod.rs | 17 +++++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5132f77e578..02717c85ccf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3696,6 +3696,7 @@ dependencies = [ "smallvec 1.0.0", "syntax", "syntax_pos", + "unicode-normalization", ] [[package]] @@ -4913,9 +4914,12 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.7" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25" +checksum = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf" +dependencies = [ + "smallvec 1.0.0", +] [[package]] name = "unicode-segmentation" diff --git a/src/librustc_parse/Cargo.toml b/src/librustc_parse/Cargo.toml index fb5cb742ab6..73458a444f4 100644 --- a/src/librustc_parse/Cargo.toml +++ b/src/librustc_parse/Cargo.toml @@ -20,3 +20,4 @@ rustc_error_codes = { path = "../librustc_error_codes" } smallvec = { version = "1.0", features = ["union", "may_dangle"] } syntax_pos = { path = "../libsyntax_pos" } syntax = { path = "../libsyntax" } +unicode-normalization = "0.1.11" diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index e5d3927af86..d69cd14d544 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -220,8 +220,7 @@ impl<'a> StringReader<'a> { if is_raw_ident { ident_start = ident_start + BytePos(2); } - // FIXME: perform NFKC normalization here. 
(Issue #2253) - let sym = self.symbol_from(ident_start); + let sym = self.nfc_symbol_from(ident_start); if is_raw_ident { let span = self.mk_sp(start, self.pos); if !sym.can_be_raw() { @@ -470,6 +469,20 @@ impl<'a> StringReader<'a> { Symbol::intern(self.str_from_to(start, end)) } + /// Like `symbol_from`, but interns the text normalized to Unicode NFC (RFC 2457). + fn nfc_symbol_from(&self, start: BytePos) -> Symbol { + use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization}; + debug!("taking an normalized ident from {:?} to {:?}", start, self.pos); + let sym = self.str_from(start); + match is_nfc_quick(sym.chars()) { + IsNormalized::Yes => Symbol::intern(sym), // quick check passed: intern the slice as-is + _ => { // any other quick-check result: run the full NFC pass + let sym_str: String = sym.chars().nfc().collect(); + Symbol::intern(&sym_str) + } + } + } + /// Slice of the source text spanning from `start` up to but excluding `end`. fn str_from_to(&self, start: BytePos, end: BytePos) -> &str { &self.src[self.src_index(start)..self.src_index(end)] From 541d879f710c05aaa47ffd16373d83b949ff1edb Mon Sep 17 00:00:00 2001 From: Charles Lew Date: Sat, 23 Nov 2019 22:37:46 +0800 Subject: [PATCH 2/3] Add a test and bless existing test case. 
--- src/test/ui/codemap_tests/unicode_2.stderr | 2 +- src/test/ui/rfc-2457/idents-normalized.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 src/test/ui/rfc-2457/idents-normalized.rs diff --git a/src/test/ui/codemap_tests/unicode_2.stderr b/src/test/ui/codemap_tests/unicode_2.stderr index 92634d8e5f9..c01942712d4 100644 --- a/src/test/ui/codemap_tests/unicode_2.stderr +++ b/src/test/ui/codemap_tests/unicode_2.stderr @@ -14,7 +14,7 @@ LL | let _ = ("아あ", 1i42); | = help: valid widths are 8, 16, 32, 64 and 128 -error[E0425]: cannot find value `a̐é` in this scope +error[E0425]: cannot find value `a̐é` in this scope --> $DIR/unicode_2.rs:6:13 | LL | let _ = a̐é; diff --git a/src/test/ui/rfc-2457/idents-normalized.rs b/src/test/ui/rfc-2457/idents-normalized.rs new file mode 100644 index 00000000000..109cec7548e --- /dev/null +++ b/src/test/ui/rfc-2457/idents-normalized.rs @@ -0,0 +1,8 @@ +// check-pass +#![feature(non_ascii_idents)] + +struct Résumé; // ['LATIN SMALL LETTER E WITH ACUTE'] + +fn main() { + let _ = Résumé; // ['LATIN SMALL LETTER E', 'COMBINING ACUTE ACCENT'] +} From 27e7a1baedbcc5ddaf44f930860828dae99a7ebf Mon Sep 17 00:00:00 2001 From: Charles Lew Date: Thu, 19 Dec 2019 11:57:30 +0800 Subject: [PATCH 3/3] Add unicode-normalization to whitelist. 
--- Cargo.lock | 1 + src/tools/rustc-workspace-hack/Cargo.toml | 3 ++- src/tools/tidy/src/deps.rs | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 02717c85ccf..f3fad1296bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3333,6 +3333,7 @@ dependencies = [ "serde", "serde_json", "smallvec 0.6.10", + "smallvec 1.0.0", "syn 0.15.35", "url 2.1.0", "winapi 0.3.8", diff --git a/src/tools/rustc-workspace-hack/Cargo.toml b/src/tools/rustc-workspace-hack/Cargo.toml index 285af038a1e..fced6c52012 100644 --- a/src/tools/rustc-workspace-hack/Cargo.toml +++ b/src/tools/rustc-workspace-hack/Cargo.toml @@ -62,7 +62,8 @@ curl-sys = { version = "0.4.13", features = ["http2", "libnghttp2-sys"], optiona crossbeam-utils = { version = "0.6.5", features = ["nightly"] } serde = { version = "1.0.82", features = ['derive'] } serde_json = { version = "1.0.31", features = ["raw_value"] } -smallvec = { version = "0.6", features = ['union', 'may_dangle'] } +smallvec-0_6 = { package = "smallvec", version = "0.6", features = ['union', 'may_dangle'] } +smallvec = { version = "1.0", features = ['union', 'may_dangle'] } url = { version = "2.0", features = ['serde'] } syn = { version = "0.15", features = ['full'] } diff --git a/src/tools/tidy/src/deps.rs b/src/tools/tidy/src/deps.rs index cb48f723d86..a3042803dd7 100644 --- a/src/tools/tidy/src/deps.rs +++ b/src/tools/tidy/src/deps.rs @@ -170,6 +170,7 @@ const WHITELIST: &[Crate<'_>] = &[ Crate("term_size"), Crate("thread_local"), Crate("ucd-util"), + Crate("unicode-normalization"), Crate("unicode-width"), Crate("unicode-xid"), Crate("unreachable"),