Auto merge of #88781 - estebank:emoji-idents, r=oli-obk

Tokenize emoji as if they were valid identifiers. In the lexer, consider emoji to be valid identifiers and reject them later to avoid knock-down parse errors. Partially addresses #86102.
2025-04-13 12:36:47 +00:00 · 2021-11-25 08:16:08 +00:00 · 2021-11-25 08:16:08 +00:00 · 23a436606b
commit 23a436606b
parent c6eda7d8a7 d92916439c
13 changed files with 223 additions and 15 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4040,6 +4040,7 @@ name = "rustc_lexer"
 version = "0.1.0"
 dependencies = [
 "expect-test",
+ "unic-emoji-char",
 "unicode-xid",
 ]

@ -5510,6 +5511,47 @@ version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"

+[[package]]
+name = "unic-char-property"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
+dependencies = [
+ "unic-char-range",
+]
+
+[[package]]
+name = "unic-char-range"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
+
+[[package]]
+name = "unic-common"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
+
+[[package]]
+name = "unic-emoji-char"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
+dependencies = [
+ "unic-char-property",
+ "unic-char-range",
+ "unic-ucd-version",
+]
+
+[[package]]
+name = "unic-ucd-version"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
+dependencies = [
+ "unic-common",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
--- a/compiler/rustc_errors/src/emitter.rs
+++ b/compiler/rustc_errors/src/emitter.rs
@ -730,7 +730,7 @@ impl EmitterWriter {
        }

        let source_string = match file.get_line(line.line_index - 1) {
-            Some(s) => replace_tabs(&*s),
+            Some(s) => normalize_whitespace(&*s),
            None => return Vec::new(),
        };

@ -1286,7 +1286,7 @@ impl EmitterWriter {
            }
            for &(ref text, _) in msg.iter() {
                // Account for newlines to align output to its label.
-                for (line, text) in replace_tabs(text).lines().enumerate() {
+                for (line, text) in normalize_whitespace(text).lines().enumerate() {
                    buffer.append(
                        0 + line,
                        &format!(
@ -1550,7 +1550,7 @@ impl EmitterWriter {

                            self.draw_line(
                                &mut buffer,
-                                &replace_tabs(&unannotated_line),
+                                &normalize_whitespace(&unannotated_line),
                                annotated_file.lines[line_idx + 1].line_index - 1,
                                last_buffer_line_num,
                                width_offset,
@ -1672,7 +1672,7 @@ impl EmitterWriter {
                    buffer.puts(
                        row_num - 1,
                        max_line_num_len + 3,
-                        &replace_tabs(
+                        &normalize_whitespace(
                            &*file_lines
                                .file
                                .get_line(file_lines.lines[line_pos].line_index)
@ -1698,7 +1698,7 @@ impl EmitterWriter {
                }

                // print the suggestion
-                buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
+                buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);

                // Colorize addition/replacements with green.
                for &SubstitutionHighlight { start, end } in highlight_parts {
@ -2081,6 +2081,7 @@ fn num_decimal_digits(num: usize) -> usize {
 // We replace some characters so the CLI output is always consistent and underlines aligned.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
    ('\t', "    "),   // We do our own tab replacement
+    ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
    ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
    ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
    ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
@ -2092,7 +2093,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
    ('\u{2069}', ""),
 ];

-fn replace_tabs(str: &str) -> String {
+fn normalize_whitespace(str: &str) -> String {
    let mut s = str.to_string();
    for (c, replacement) in OUTPUT_REPLACEMENTS {
        s = s.replace(*c, replacement);
--- a/compiler/rustc_interface/src/passes.rs
+++ b/compiler/rustc_interface/src/passes.rs
@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
 use rustc_session::search_paths::PathKind;
 use rustc_session::{Limit, Session};
 use rustc_span::symbol::{sym, Ident, Symbol};
-use rustc_span::FileName;
+use rustc_span::{FileName, MultiSpan};
 use rustc_trait_selection::traits;
 use rustc_typeck as typeck;
 use tempfile::Builder as TempFileBuilder;
@ -450,6 +450,19 @@ pub fn configure_and_expand(
        });
    }

+    // Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
+    sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
+        let mut identifiers: Vec<_> = identifiers.drain().collect();
+        identifiers.sort_by_key(|&(key, _)| key);
+        for (ident, mut spans) in identifiers.into_iter() {
+            spans.sort();
+            sess.diagnostic().span_err(
+                MultiSpan::from(spans),
+                &format!("identifiers cannot contain emoji: `{}`", ident),
+            );
+        }
+    });
+
    Ok(krate)
 }

--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@ -17,6 +17,7 @@ doctest = false
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
 unicode-xid = "0.2.0"
+unic-emoji-char = "0.9.0"

 [dev-dependencies]
 expect-test = "1.0"
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@ -64,6 +64,8 @@ pub enum TokenKind {
    /// "ident" or "continue"
    /// At this step keywords are also considered identifiers.
    Ident,
+    /// Like `Ident`, but containing invalid Unicode codepoints (e.g. emoji).
+    InvalidIdent,
    /// "r#ident"
    RawIdent,
    /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@ -411,6 +413,10 @@ impl Cursor<'_> {
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
+            // Identifier starting with an emoji. Only lexed for graceful error recovery.
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
            _ => Unknown,
        };
        Token::new(token_kind, self.len_consumed())
@ -492,10 +498,28 @@ impl Cursor<'_> {
        // we see a prefix here, it is definitely an unknown prefix.
        match self.first() {
            '#' | '"' | '\'' => UnknownPrefix,
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
            _ => Ident,
        }
    }

+    fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten; eat the rest, also accepting emoji and ZWJ (U+200D) joiners.
+        self.eat_while(|c| {
+            unicode_xid::UnicodeXID::is_xid_continue(c)
+                || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
+                || c == '\u{200d}'
+        });
+        // Known prefixes must have been handled earlier, so a prefix seen here is definitely
+        // unknown; otherwise the token is an `InvalidIdent`, lexed only for error recovery.
+        match self.first() {
+            '#' | '"' | '\'' => UnknownPrefix,
+            _ => InvalidIdent,
+        }
+    }
+
    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -1,3 +1,4 @@
+use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
                }
                token::Ident(sym, is_raw_ident)
            }
+            rustc_lexer::TokenKind::InvalidIdent
+                // Do not recover an identifier with emoji if the codepoint is a confusable
+                // with a recoverable substitution token, like `➖`.
+                if UNICODE_ARRAY
+                    .iter()
+                    .find(|&&(c, _, _)| {
+                        let sym = self.str_from(start);
+                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                    })
+                    .is_none() =>
+            {
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
+                token::Ident(sym, false)
+            }
            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                let suffix_start = start + BytePos(suffix_start as u32);
                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

-            rustc_lexer::TokenKind::Unknown => {
+            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                let c = self.str_from(start).chars().next().unwrap();
                let mut err =
                    self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
 use rustc_span::{symbol::kw, BytePos, Pos, Span};

 #[rustfmt::skip] // for line breaks
-const UNICODE_ARRAY: &[(char, &str, char)] = &[
+pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
    (' ', "Line Separator", ' '),
    (' ', "Paragraph Separator", ' '),
    (' ', "Ogham Space mark", ' '),
--- a/compiler/rustc_session/src/parse.rs
+++ b/compiler/rustc_session/src/parse.rs
@ -119,8 +119,13 @@ pub struct ParseSess {
    pub config: CrateConfig,
    pub edition: Edition,
    pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
-    /// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
+    /// Places where raw identifiers were used. This is used to avoid complaining about idents
+    /// clashing with keywords in new editions.
    pub raw_identifier_spans: Lock<Vec<Span>>,
+    /// Spans of tokens that contain invalid Unicode codepoints (such as emoji) but otherwise
+    /// look like identifiers. Lexing them as identifiers avoids bad tokenization when
+    /// encountering emoji. They are grouped to provide a single error per unique identifier.
+    pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
    source_map: Lrc<SourceMap>,
    pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
    /// Contains the spans of block expressions that could have been incomplete based on the
@ -160,6 +165,7 @@ impl ParseSess {
            edition: ExpnId::root().expn_data().edition,
            missing_fragment_specifiers: Default::default(),
            raw_identifier_spans: Lock::new(Vec::new()),
+            bad_unicode_identifiers: Lock::new(Default::default()),
            source_map,
            buffered_lints: Lock::new(vec![]),
            ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@ -632,7 +632,7 @@ impl<'a> Classifier<'a> {
                },
                Some(c) => c,
            },
-            TokenKind::RawIdent | TokenKind::UnknownPrefix => {
+            TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
                Class::Ident(self.new_span(before, text))
            }
            TokenKind::Lifetime { .. } => Class::Lifetime,
--- a/src/test/ui/parser/emoji-identifiers.rs
+++ b/src/test/ui/parser/emoji-identifiers.rs
@ -0,0 +1,16 @@
+struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
+struct 👀; //~ ERROR identifiers cannot contain emoji
+impl 👀 {
+    fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
+        👀
+    }
+}
+fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
+    👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
+    //~^ ERROR identifiers cannot contain emoji
+}
+fn main() {
+    let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
+    //~^ ERROR identifiers cannot contain emoji
+    //~| ERROR unknown start of token: \u{2796}
+}
--- a/src/test/ui/parser/emoji-identifiers.stderr
+++ b/src/test/ui/parser/emoji-identifiers.stderr
@ -0,0 +1,83 @@
+error: unknown start of token: \u{2796}
+  --> $DIR/emoji-identifiers.rs:13:33
+   |
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |                                  ^^
+   |
+help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
+   |
+LL |     let _ = i_like_to_😄_a_lot() - 4;
+   |                                  ~
+
+error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   | ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
+...
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |             ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
+
+error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
+  --> $DIR/emoji-identifiers.rs:1:8
+   |
+LL | struct ABig👩👩👧👧Family;
+   |        ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `👀`
+  --> $DIR/emoji-identifiers.rs:2:8
+   |
+LL | struct 👀;
+   |        ^^
+LL | impl 👀 {
+   |      ^^
+LL |     fn full_of_✨() -> 👀 {
+   |                        ^^
+LL |         👀
+   |         ^^
+...
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |                            ^^
+LL |     👀::full_of✨()
+   |     ^^
+
+error: identifiers cannot contain emoji: `full_of_✨`
+  --> $DIR/emoji-identifiers.rs:4:8
+   |
+LL |     fn full_of_✨() -> 👀 {
+   |        ^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
+  --> $DIR/emoji-identifiers.rs:8:4
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |    ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `full_of✨`
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+
+error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |             ^^^^^^^^^^^^^^^^^^
+
+error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL | struct 👀;
+   | ---------- function or associated item `full_of✨` not found for this
+...
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+   |         |
+   |         function or associated item not found in `👀`
+   |         help: there is an associated function with a similar name: `full_of_✨`
+
+error: aborting due to 9 previous errors
+
+Some errors have detailed explanations: E0425, E0599.
+For more information about an error, try `rustc --explain E0425`.
--- a/src/tools/cargo
+++ b/src/tools/cargo
@ -1 +1 @@
-Subproject commit e1fb17631eb1b3665cdbe45b1c186111577ef512
+Subproject commit 7f08ace4f1305de7f3b1b0e2f765911957226bd4
--- a/src/tools/tidy/src/deps.rs
+++ b/src/tools/tidy/src/deps.rs
@ -82,8 +82,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
    "bitflags",
    "block-buffer",
    "block-padding",
-    "byteorder",
    "byte-tools",
+    "byteorder",
    "cc",
    "cfg-if",
    "chalk-derive",
@ -140,9 +140,9 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
    "memmap2",
    "memoffset",
    "miniz_oxide",
-    "num_cpus",
    "num-integer",
    "num-traits",
+    "num_cpus",
    "object",
    "odht",
    "once_cell",
@ -190,8 +190,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
    "serde_json",
    "sha-1",
    "sha2",
-    "smallvec",
    "sharded-slab",
+    "smallvec",
    "snap",
    "stable_deref_trait",
    "stacker",
@ -211,6 +211,11 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
    "tracing-subscriber",
    "tracing-tree",
    "typenum",
+    "unic-char-property",
+    "unic-char-range",
+    "unic-common",
+    "unic-emoji-char",
+    "unic-ucd-version",
    "unicode-normalization",
    "unicode-script",
    "unicode-security",