From a3d6bc34684f67b9b5b2a419d57dc0afe5af4a67 Mon Sep 17 00:00:00 2001
From: clubby789 <jamie@hill-daniel.co.uk>
Date: Sat, 7 Jan 2023 16:33:05 +0000
Subject: [PATCH] Emit a single error for contiguous sequences of Unicode
 homoglyphs

---
 compiler/rustc_parse/src/lexer/mod.rs         |  28 +++++++++++++++---
 .../rustc_parse/src/lexer/unicode_chars.rs    |  10 +++++--
 tests/rustdoc-ui/invalid-syntax.stderr        |   2 --
 tests/ui/parser/issues/issue-66473.stderr     | Bin 5260 -> 1061 bytes
 tests/ui/parser/issues/issue-68629.stderr     | Bin 1831 -> 1637 bytes
 tests/ui/parser/issues/issue-68730.stderr     | Bin 1226 -> 1266 bytes
 tests/ui/parser/unicode-chars.rs              |   4 +++
 tests/ui/parser/unicode-chars.stderr          |  14 ++++++++-
 8 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index f027843e6b4..8761c23625b 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
     /// preceded by whitespace.
     fn next_token(&mut self) -> (Token, bool) {
         let mut preceded_by_whitespace = false;
-
+        let mut swallow_next_invalid = 0;
         // Skip trivial (whitespace & comments) tokens
         loop {
             let token = self.cursor.advance_token();
@@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
                 rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
                 rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                    let c = self.str_from(start).chars().next().unwrap();
+                    // Skip the repeats of an invalid character already reported as one run
+                    if swallow_next_invalid > 0 {
+                        swallow_next_invalid -= 1;
+                        continue;
+                    }
+                    let mut it = self.str_from_to_end(start).chars();
+                    let c = it.next().unwrap();
+                    let repeats = it.take_while(|c1| *c1 == c).count();
                     let mut err =
-                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                        self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
                     // FIXME: the lexer could be used to turn the ASCII version of unicode
                     // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                     // token. Ideally, this should be inside `rustc_lexer`. However, we should
                     // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                     // fancier error recovery to it, as there will be less overall work to do this
                     // way.
-                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
                     if c == '\x00' {
                         err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
                     }
+                    if repeats > 0 {
+                        if repeats == 1 {
+                            err.note(format!("character appears once more"));
+                        } else {
+                            err.note(format!("character appears {repeats} more times"));
+                        }
+                        swallow_next_invalid = repeats;
+                    }
                     err.emit();
                     if let Some(token) = token {
                         token
@@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
+    /// Slice of the source text spanning from `start` to the end of the source text
+    fn str_from_to_end(&self, start: BytePos) -> &str {
+        &self.src[self.src_index(start)..]
+    }
+
     fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
         match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
             Err(RawStrError::InvalidStarter { bad_char }) => {
diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs
index f1b50296e25..65479b341d7 100644
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
     pos: BytePos,
     ch: char,
     err: &mut Diagnostic,
+    count: usize,
 ) -> Option<token::TokenKind> {
     let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
 
-    let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
+    let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
 
     let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
         let msg = format!("substitution character not found for '{}'", ch);
@@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
             "Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
             ch, u_name, ascii_char, ascii_name
         );
-        err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
+        err.span_suggestion(
+            span,
+            &msg,
+            ascii_char.to_string().repeat(count),
+            Applicability::MaybeIncorrect,
+        );
     }
     token.clone()
 }
diff --git a/tests/rustdoc-ui/invalid-syntax.stderr b/tests/rustdoc-ui/invalid-syntax.stderr
index 597d19e748c..6140a06c555 100644
--- a/tests/rustdoc-ui/invalid-syntax.stderr
+++ b/tests/rustdoc-ui/invalid-syntax.stderr
@@ -77,8 +77,6 @@ LL | ///     ```
    |         ^^^
    |
    = note: error from rustc: unknown start of token: `
-   = note: error from rustc: unknown start of token: `
-   = note: error from rustc: unknown start of token: `
 
 warning: could not parse code block as Rust code
   --> $DIR/invalid-syntax.rs:64:5
diff --git a/tests/ui/parser/issues/issue-66473.stderr b/tests/ui/parser/issues/issue-66473.stderr
index 8a16d7f955129811997464d47c4e10238db77340..0e8b0a5da220569b607fe3512ae59ee94e3fcf1b 100644
GIT binary patch
delta 122
zcmeCtT*|Q_no&zZLBUobFTW(!N+CHTu_!UQB(+E(v7jI|v8Y(V&|D!mzbI9qBr`X)
wc=7~6m(AsZ;f!i{_2lOzr-F4&UL$-)*i50ID77pzzqD8(wWuh+sF;fj0L7&#-~a#s

delta 252
zcmZ3=(WAK`nsKrtlLNDfmC3{ntD#I&1k((`G)FKk5KK!blf}f!&;W^Th{QHRC^wuu
zky&Hn^%js#AT2OkL2Q_{AU4ci5F2JOhz+wD#745(2x_(R=9P@)ARjp~noVwDnlss*
b`Lu|Ui9$h9YFTD}X|Y0TQBi(TF&7sAkD^jK

diff --git a/tests/ui/parser/issues/issue-68629.stderr b/tests/ui/parser/issues/issue-68629.stderr
index b2c7dddc8011c21e1f3d21185bc48a97dbadf79c..43a903e6c4698e59840cdf18f73ed830c8b50c32 100644
GIT binary patch
delta 75
zcmZ3^_mpQtH<Oxzf`Y9=UVcfcl|phxVo_ppNotWoVnIP_Vo|X|eqM5_LT-Lh>SR}z
f+nfDZjxh;aC=?W>mSyIb7AvF{73CKdb8!Ix<I);s

delta 50
zcmaFLvz%{3H`C-rj82S3lNYkeOwMO!o6OGCvUwBJ0%l=Lg@U5gvdsL_VujSAqWq#_
GE-nDJ;1JUQ

diff --git a/tests/ui/parser/issues/issue-68730.stderr b/tests/ui/parser/issues/issue-68730.stderr
index 6585a19d954efd39c15e3d7bd77ecb26e9796ae8..5bca5bbebeacb44984c8679e4d2c4757be4ee742 100644
GIT binary patch
delta 87
zcmX@b`H6Fb6_aipmx6*q4T!K+$jdKDwNgmVNGwWBE=es?NGvEwO)M%_$j?hoRmjaR
iN}cS$+#q0M1J(l60RnY(lP@s3PPSoD-)zm?&j<jNTpAbv

delta 41
tcmeywd5Uv`71LxtMz+bB%=H2`r<=GG6clQJ7zFAjUto0IY{R^b5daTK3{U_7

diff --git a/tests/ui/parser/unicode-chars.rs b/tests/ui/parser/unicode-chars.rs
index 89ae85ec990..ba35e95c82a 100644
--- a/tests/ui/parser/unicode-chars.rs
+++ b/tests/ui/parser/unicode-chars.rs
@@ -2,4 +2,8 @@ fn main() {
     let y = 0;
     //~^ ERROR unknown start of token: \u{37e}
     //~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
+        let x = 0;
+    //~^ ERROR unknown start of token: \u{a0}
+    //~^^ NOTE character appears 3 more times
+    //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
 }
diff --git a/tests/ui/parser/unicode-chars.stderr b/tests/ui/parser/unicode-chars.stderr
index 0cfe9240e85..6a5b27872e7 100644
--- a/tests/ui/parser/unicode-chars.stderr
+++ b/tests/ui/parser/unicode-chars.stderr
@@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
 LL |     let y = 0;
    |              ~
 
-error: aborting due to previous error
+error: unknown start of token: \u{a0}
+  --> $DIR/unicode-chars.rs:5:5
+   |
+LL |         let x = 0;
+   |     ^^^^
+   |
+   = note: character appears 3 more times
+help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
+   |
+LL |         let x = 0;
+   |     ++++
+
+error: aborting due to 2 previous errors