From fb4dba0a17b6eb241b4fd3732b976ab38fe2cdc0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 12:12:58 +1000 Subject: [PATCH] Inline and remove `cook_lexer_token`. This is a small performance win, alas. --- compiler/rustc_parse/src/lexer/mod.rs | 347 +++++++++++++------------- 1 file changed, 175 insertions(+), 172 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 67fefd19d8b..151e80e2b3e 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -86,13 +86,182 @@ impl<'a> StringReader<'a> { debug!("next_token: {:?}({:?})", token.kind, self.str_from(start)); - match self.cook_lexer_token(token.kind, start) { - Some(kind) => { - let span = self.mk_sp(start, self.pos); - return (Token::new(kind, span), preceded_by_whitespace); + // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a + // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs + // additional validation. + let kind = match token.kind { + rustc_lexer::TokenKind::LineComment { doc_style } => { + // Skip non-doc comments + let Some(doc_style) = doc_style else { + self.lint_unicode_text_flow(start); + preceded_by_whitespace = true; + continue; + }; + + // Opening delimiter of the length 3 is not included into the symbol. + let content_start = start + BytePos(3); + let content = self.str_from(content_start); + self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) } - None => preceded_by_whitespace = true, - } + rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { + if !terminated { + self.report_unterminated_block_comment(start, doc_style); + } + + // Skip non-doc comments + let Some(doc_style) = doc_style else { + self.lint_unicode_text_flow(start); + preceded_by_whitespace = true; + continue; + }; + + // Opening delimiter of the length 3 and closing delimiter of the length 2 + // are not included into the symbol. + let content_start = start + BytePos(3); + let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); + let content = self.str_from_to(content_start, content_end); + self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) + } + rustc_lexer::TokenKind::Whitespace => { + preceded_by_whitespace = true; + continue; + } + rustc_lexer::TokenKind::Ident => { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + token::Ident(sym, false) + } + rustc_lexer::TokenKind::RawIdent => { + let sym = nfc_normalize(self.str_from(start + BytePos(2))); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + if !sym.can_be_raw() { + self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); + } + self.sess.raw_identifier_spans.borrow_mut().push(span); + token::Ident(sym, true) + } + rustc_lexer::TokenKind::UnknownPrefix => { + self.report_unknown_prefix(start); + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + token::Ident(sym, false) + } + rustc_lexer::TokenKind::InvalidIdent + // Do not recover an identifier with emoji if the codepoint is a confusable + // with a recoverable substitution token, like `➖`. + if !UNICODE_ARRAY + .iter() + .any(|&(c, _, _)| { + let sym = self.str_from(start); + sym.chars().count() == 1 && c == sym.chars().next().unwrap() + }) => + { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default() + .push(span); + token::Ident(sym, false) + } + rustc_lexer::TokenKind::Literal { kind, suffix_start } => { + let suffix_start = start + BytePos(suffix_start); + let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); + let suffix = if suffix_start < self.pos { + let string = self.str_from(suffix_start); + if string == "_" { + self.sess + .span_diagnostic + .struct_span_warn( + self.mk_sp(suffix_start, self.pos), + "underscore literal suffix is not allowed", + ) + .warn( + "this was previously accepted by the compiler but is \ + being phased out; it will become a hard error in \ + a future release!", + ) + .note( + "see issue #42326 \ + \ + for more information", + ) + .emit(); + None + } else { + Some(Symbol::intern(string)) + } + } else { + None + }; + token::Literal(token::Lit { kind, symbol, suffix }) + } + rustc_lexer::TokenKind::Lifetime { starts_with_number } => { + // Include the leading `'` in the real identifier, for macro + // expansion purposes. See #12512 for the gory details of why + // this is necessary. + let lifetime_name = self.str_from(start); + if starts_with_number { + self.err_span_(start, self.pos, "lifetimes cannot start with a number"); + } + let ident = Symbol::intern(lifetime_name); + token::Lifetime(ident) + } + rustc_lexer::TokenKind::Semi => token::Semi, + rustc_lexer::TokenKind::Comma => token::Comma, + rustc_lexer::TokenKind::Dot => token::Dot, + rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis), + rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis), + rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace), + rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace), + rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket), + rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket), + rustc_lexer::TokenKind::At => token::At, + rustc_lexer::TokenKind::Pound => token::Pound, + rustc_lexer::TokenKind::Tilde => token::Tilde, + rustc_lexer::TokenKind::Question => token::Question, + rustc_lexer::TokenKind::Colon => token::Colon, + rustc_lexer::TokenKind::Dollar => token::Dollar, + rustc_lexer::TokenKind::Eq => token::Eq, + rustc_lexer::TokenKind::Bang => token::Not, + rustc_lexer::TokenKind::Lt => token::Lt, + rustc_lexer::TokenKind::Gt => token::Gt, + rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), + rustc_lexer::TokenKind::And => token::BinOp(token::And), + rustc_lexer::TokenKind::Or => token::BinOp(token::Or), + rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), + rustc_lexer::TokenKind::Star => token::BinOp(token::Star), + rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), + rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), + rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), + + rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { + let c = self.str_from(start).chars().next().unwrap(); + let mut err = + self.struct_err_span_char(start, self.pos, "unknown start of token", c); + // FIXME: the lexer could be used to turn the ASCII version of unicode + // homoglyphs, instead of keeping a table in `check_for_substitution`into the + // token. Ideally, this should be inside `rustc_lexer`. However, we should + // first remove compound tokens like `<<` from `rustc_lexer`, and then add + // fancier error recovery to it, as there will be less overall work to do this + // way. + let token = unicode_chars::check_for_substitution(self, start, c, &mut err); + if c == '\x00' { + err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used"); + } + err.emit(); + if let Some(token) = token { + token + } else { + preceded_by_whitespace = true; + continue; + } + } + rustc_lexer::TokenKind::Eof => token::Eof, + }; + let span = self.mk_sp(start, self.pos); + return (Token::new(kind, span), preceded_by_whitespace); } } @@ -158,172 +327,6 @@ impl<'a> StringReader<'a> { } } - /// Turns simple `rustc_lexer::TokenKind` enum into a rich - /// `rustc_ast::TokenKind`. This turns strings into interned - /// symbols and runs additional validation. - fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option { - Some(match token { - rustc_lexer::TokenKind::LineComment { doc_style } => { - // Skip non-doc comments - let Some(doc_style) = doc_style else { - self.lint_unicode_text_flow(start); - return None; - }; - - // Opening delimiter of the length 3 is not included into the symbol. - let content_start = start + BytePos(3); - let content = self.str_from(content_start); - self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) - } - rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { - if !terminated { - self.report_unterminated_block_comment(start, doc_style); - } - - // Skip non-doc comments - let Some(doc_style) = doc_style else { - self.lint_unicode_text_flow(start); - return None; - }; - - // Opening delimiter of the length 3 and closing delimiter of the length 2 - // are not included into the symbol. - let content_start = start + BytePos(3); - let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); - let content = self.str_from_to(content_start, content_end); - self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) - } - rustc_lexer::TokenKind::Whitespace => return None, - rustc_lexer::TokenKind::Ident => { - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - token::Ident(sym, false) - } - rustc_lexer::TokenKind::RawIdent => { - let sym = nfc_normalize(self.str_from(start + BytePos(2))); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - if !sym.can_be_raw() { - self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); - } - self.sess.raw_identifier_spans.borrow_mut().push(span); - token::Ident(sym, true) - } - rustc_lexer::TokenKind::UnknownPrefix => { - self.report_unknown_prefix(start); - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - token::Ident(sym, false) - } - rustc_lexer::TokenKind::InvalidIdent - // Do not recover an identifier with emoji if the codepoint is a confusable - // with a recoverable substitution token, like `➖`. - if !UNICODE_ARRAY - .iter() - .any(|&(c, _, _)| { - let sym = self.str_from(start); - sym.chars().count() == 1 && c == sym.chars().next().unwrap() - }) - => - { - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span); - token::Ident(sym, false) - } - rustc_lexer::TokenKind::Literal { kind, suffix_start } => { - let suffix_start = start + BytePos(suffix_start); - let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); - let suffix = if suffix_start < self.pos { - let string = self.str_from(suffix_start); - if string == "_" { - self.sess - .span_diagnostic - .struct_span_warn( - self.mk_sp(suffix_start, self.pos), - "underscore literal suffix is not allowed", - ) - .warn( - "this was previously accepted by the compiler but is \ - being phased out; it will become a hard error in \ - a future release!", - ) - .note( - "see issue #42326 \ - \ - for more information", - ) - .emit(); - None - } else { - Some(Symbol::intern(string)) - } - } else { - None - }; - token::Literal(token::Lit { kind, symbol, suffix }) - } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { - // Include the leading `'` in the real identifier, for macro - // expansion purposes. See #12512 for the gory details of why - // this is necessary. - let lifetime_name = self.str_from(start); - if starts_with_number { - self.err_span_(start, self.pos, "lifetimes cannot start with a number"); - } - let ident = Symbol::intern(lifetime_name); - token::Lifetime(ident) - } - rustc_lexer::TokenKind::Semi => token::Semi, - rustc_lexer::TokenKind::Comma => token::Comma, - rustc_lexer::TokenKind::Dot => token::Dot, - rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis), - rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis), - rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace), - rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace), - rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket), - rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket), - rustc_lexer::TokenKind::At => token::At, - rustc_lexer::TokenKind::Pound => token::Pound, - rustc_lexer::TokenKind::Tilde => token::Tilde, - rustc_lexer::TokenKind::Question => token::Question, - rustc_lexer::TokenKind::Colon => token::Colon, - rustc_lexer::TokenKind::Dollar => token::Dollar, - rustc_lexer::TokenKind::Eq => token::Eq, - rustc_lexer::TokenKind::Bang => token::Not, - rustc_lexer::TokenKind::Lt => token::Lt, - rustc_lexer::TokenKind::Gt => token::Gt, - rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), - rustc_lexer::TokenKind::And => token::BinOp(token::And), - rustc_lexer::TokenKind::Or => token::BinOp(token::Or), - rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), - rustc_lexer::TokenKind::Star => token::BinOp(token::Star), - rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), - rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), - rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - - rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { - let c = self.str_from(start).chars().next().unwrap(); - let mut err = - self.struct_err_span_char(start, self.pos, "unknown start of token", c); - // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, - // instead of keeping a table in `check_for_substitution`into the token. Ideally, - // this should be inside `rustc_lexer`. However, we should first remove compound - // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it, - // as there will be less overall work to do this way. - let token = unicode_chars::check_for_substitution(self, start, c, &mut err); - if c == '\x00' { - err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used"); - } - err.emit(); - token? - } - rustc_lexer::TokenKind::Eof => token::Eof, - }) - } - fn cook_doc_comment( &self, content_start: BytePos,