Rollup merge of #106566 - clubby789:contiguous-weird-unicode, r=cjgillot

Emit a single error for contiguous sequences of unknown tokens

Closes #106101

On encountering a sequence of identical source characters which are unknown tokens, note the number of subsequent characters and advance past them silently. The old behavior was to emit an error and 'help' note for every single one.

`@rustbot` label +A-diagnostics +A-parser
This commit is contained in:
Matthias Krüger 2023-01-14 13:04:24 +01:00 committed by GitHub
commit 8e0eecdba6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 49 additions and 9 deletions

View File

@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
loop {
let token = self.cursor.advance_token();
@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
continue;
}
let mut it = self.str_from_to_end(start).chars();
let c = it.next().unwrap();
let repeats = it.take_while(|c1| *c1 == c).count();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode
// homoglyphs, instead of keeping a table in `check_for_substitution` into the
// token. Ideally, this should be inside `rustc_lexer`. However, we should
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this
// way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
if repeats > 0 {
if repeats == 1 {
err.note(format!("character appears once more"));
} else {
err.note(format!("character appears {repeats} more times"));
}
swallow_next_invalid = repeats;
}
err.emit();
if let Some(token) = token {
token
@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}
/// Returns the tail of the source text: everything from `start` through
/// the end of `self.src`.
fn str_from_to_end(&self, start: BytePos) -> &str {
    // Translate the position into an index into `self.src` first, then
    // take the open-ended slice from that index.
    let begin = self.src_index(start);
    &self.src[begin..]
}
fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
Err(RawStrError::InvalidStarter { bad_char }) => {

View File

@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
pos: BytePos,
ch: char,
err: &mut Diagnostic,
count: usize,
) -> Option<token::TokenKind> {
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
let msg = format!("substitution character not found for '{}'", ch);
@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
ch, u_name, ascii_char, ascii_name
);
err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
err.span_suggestion(
span,
&msg,
ascii_char.to_string().repeat(count),
Applicability::MaybeIncorrect,
);
}
token.clone()
}

View File

@ -77,8 +77,6 @@ LL | /// ```
| ^^^
|
= note: error from rustc: unknown start of token: `
= note: error from rustc: unknown start of token: `
= note: error from rustc: unknown start of token: `
warning: could not parse code block as Rust code
--> $DIR/invalid-syntax.rs:64:5

View File

@ -2,4 +2,8 @@ fn main() {
let y = 0;
//~^ ERROR unknown start of token: \u{37e}
//~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
    let x = 0;
//~^ ERROR unknown start of token: \u{a0}
//~^^ NOTE character appears 3 more times
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
}

View File

@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
LL | let y = 0;
| ~
error: aborting due to previous error
error: unknown start of token: \u{a0}
--> $DIR/unicode-chars.rs:5:5
|
LL |     let x = 0;
| ^^^^
|
= note: character appears 3 more times
help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
|
LL | let x = 0;
| ++++
error: aborting due to 2 previous errors