From e9a0c3c98c5640070e15a3cb38860a7268c1dca2 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 19 Nov 2024 15:55:34 +1100 Subject: [PATCH] Remove `TokenKind::InvalidPrefix`. It was added in #123752 to handle some cases involving emoji, but it isn't necessary because it's always treated the same as `TokenKind::InvalidIdent`. This commit removes it, which makes things a little simpler. --- compiler/rustc_lexer/src/lib.rs | 21 +++++++------------ compiler/rustc_parse/src/lexer/mod.rs | 5 ++--- src/librustdoc/html/highlight.rs | 7 +++---- .../crates/parser/src/lexed_str.rs | 2 +- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index c01dad810c4..bcb103957ba 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -99,10 +99,6 @@ pub enum TokenKind { /// several tokens: `'r` and `#` and `foo`. RawLifetime, - /// Similar to the above, but *always* an error on every edition. This is used - /// for emoji identifier recovery, as those are not meant to be ever accepted. - InvalidPrefix, - /// Guarded string literal prefix: `#"` or `##`. /// /// Used for reserving "guarded strings" (RFC 3598) in edition 2024. @@ -466,7 +462,7 @@ impl Cursor<'_> { Literal { kind, suffix_start } } // Identifier starting with an emoji. Only lexed for graceful error recovery. - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Unknown, }; let res = Token::new(token_kind, self.pos_within_token()); @@ -550,23 +546,22 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Ident, } } - fn invalid_ident_or_prefix(&mut self) -> TokenKind { + fn invalid_ident(&mut self) -> TokenKind { // Start is already eaten, eat the rest of identifier. self.eat_while(|c| { const ZERO_WIDTH_JOINER: char = '\u{200d}'; is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER }); - // Known prefixes must have been handled earlier. So if - // we see a prefix here, it is definitely an unknown prefix. - match self.first() { - '#' | '"' | '\'' => InvalidPrefix, - _ => InvalidIdent, - } + // An invalid identifier followed by '#' or '"' or '\'' could be + // interpreted as an invalid literal prefix. We don't bother doing that + // because the treatment of invalid identifiers and invalid prefixes + // would be the same. + InvalidIdent } fn c_or_byte_string( diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 226de65445c..5023e83bd67 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { let ident = Symbol::intern(lifetime_name); token::Lifetime(ident, IdentIsRaw::No) } - rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix + rustc_lexer::TokenKind::InvalidIdent // Do not recover an identifier with emoji if the codepoint is a confusable // with a recoverable substitution token, like `➖`. if !UNICODE_ARRAY.iter().any(|&(c, _, _)| { @@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), rustc_lexer::TokenKind::Unknown - | rustc_lexer::TokenKind::InvalidIdent - | rustc_lexer::TokenKind::InvalidPrefix => { + | rustc_lexer::TokenKind::InvalidIdent => { // Don't emit diagnostics for sequences of the same invalid token if swallow_next_invalid > 0 { swallow_next_invalid -= 1; diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 4def80764ea..29f6f92a6b2 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -861,10 +861,9 @@ impl<'src> Classifier<'src> { }, Some(c) => c, }, - TokenKind::RawIdent - | TokenKind::UnknownPrefix - | TokenKind::InvalidPrefix - | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)), + TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { + Class::Ident(self.new_span(before, text)) + } TokenKind::Lifetime { .. } | TokenKind::RawLifetime | TokenKind::UnknownPrefixLifetime => Class::Lifetime, diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index 3c0eb1b42a6..c97596d5097 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -183,7 +183,7 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Ident => { SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT) } - rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::InvalidIdent => { err = "Ident contains invalid characters"; IDENT }