Inline and remove cook_lexer_token.

This is a small performance win, alas.
This commit is contained in:
Nicholas Nethercote 2022-09-26 12:12:58 +10:00
parent da84f0f4c3
commit fb4dba0a17

View File

@ -86,13 +86,182 @@ impl<'a> StringReader<'a> {
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
match self.cook_lexer_token(token.kind, start) {
Some(kind) => {
let span = self.mk_sp(start, self.pos);
return (Token::new(kind, span), preceded_by_whitespace);
// Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
// rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
// additional validation.
let kind = match token.kind {
rustc_lexer::TokenKind::LineComment { doc_style } => {
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
preceded_by_whitespace = true;
continue;
};
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
None => preceded_by_whitespace = true,
}
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
self.report_unterminated_block_comment(start, doc_style);
}
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
preceded_by_whitespace = true;
continue;
};
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
rustc_lexer::TokenKind::Whitespace => {
preceded_by_whitespace = true;
continue;
}
rustc_lexer::TokenKind::Ident => {
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::RawIdent => {
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
if !sym.can_be_raw() {
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
}
self.sess.raw_identifier_spans.borrow_mut().push(span);
token::Ident(sym, true)
}
rustc_lexer::TokenKind::UnknownPrefix => {
self.report_unknown_prefix(start);
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like ``.
if !UNICODE_ARRAY
.iter()
.any(|&(c, _, _)| {
let sym = self.str_from(start);
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
}) =>
{
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
.push(span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
let suffix = if suffix_start < self.pos {
let string = self.str_from(suffix_start);
if string == "_" {
self.sess
.span_diagnostic
.struct_span_warn(
self.mk_sp(suffix_start, self.pos),
"underscore literal suffix is not allowed",
)
.warn(
"this was previously accepted by the compiler but is \
being phased out; it will become a hard error in \
a future release!",
)
.note(
"see issue #42326 \
<https://github.com/rust-lang/rust/issues/42326> \
for more information",
)
.emit();
None
} else {
Some(Symbol::intern(string))
}
} else {
None
};
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = self.str_from(start);
if starts_with_number {
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
}
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident)
}
rustc_lexer::TokenKind::Semi => token::Semi,
rustc_lexer::TokenKind::Comma => token::Comma,
rustc_lexer::TokenKind::Dot => token::Dot,
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::At => token::At,
rustc_lexer::TokenKind::Pound => token::Pound,
rustc_lexer::TokenKind::Tilde => token::Tilde,
rustc_lexer::TokenKind::Question => token::Question,
rustc_lexer::TokenKind::Colon => token::Colon,
rustc_lexer::TokenKind::Dollar => token::Dollar,
rustc_lexer::TokenKind::Eq => token::Eq,
rustc_lexer::TokenKind::Bang => token::Not,
rustc_lexer::TokenKind::Lt => token::Lt,
rustc_lexer::TokenKind::Gt => token::Gt,
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
rustc_lexer::TokenKind::And => token::BinOp(token::And),
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode
// homoglyphs, instead of keeping a table in `check_for_substitution`into the
// token. Ideally, this should be inside `rustc_lexer`. However, we should
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this
// way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
err.emit();
if let Some(token) = token {
token
} else {
preceded_by_whitespace = true;
continue;
}
}
rustc_lexer::TokenKind::Eof => token::Eof,
};
let span = self.mk_sp(start, self.pos);
return (Token::new(kind, span), preceded_by_whitespace);
}
}
@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
}
}
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
/// `rustc_ast::TokenKind`. This turns strings into interned
/// symbols and runs additional validation.
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
Some(match token {
rustc_lexer::TokenKind::LineComment { doc_style } => {
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
return None;
};
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
self.report_unterminated_block_comment(start, doc_style);
}
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
return None;
};
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
rustc_lexer::TokenKind::Whitespace => return None,
rustc_lexer::TokenKind::Ident => {
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::RawIdent => {
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
if !sym.can_be_raw() {
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
}
self.sess.raw_identifier_spans.borrow_mut().push(span);
token::Ident(sym, true)
}
rustc_lexer::TokenKind::UnknownPrefix => {
self.report_unknown_prefix(start);
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like ``.
if !UNICODE_ARRAY
.iter()
.any(|&(c, _, _)| {
let sym = self.str_from(start);
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
})
=>
{
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
let suffix = if suffix_start < self.pos {
let string = self.str_from(suffix_start);
if string == "_" {
self.sess
.span_diagnostic
.struct_span_warn(
self.mk_sp(suffix_start, self.pos),
"underscore literal suffix is not allowed",
)
.warn(
"this was previously accepted by the compiler but is \
being phased out; it will become a hard error in \
a future release!",
)
.note(
"see issue #42326 \
<https://github.com/rust-lang/rust/issues/42326> \
for more information",
)
.emit();
None
} else {
Some(Symbol::intern(string))
}
} else {
None
};
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = self.str_from(start);
if starts_with_number {
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
}
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident)
}
rustc_lexer::TokenKind::Semi => token::Semi,
rustc_lexer::TokenKind::Comma => token::Comma,
rustc_lexer::TokenKind::Dot => token::Dot,
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::At => token::At,
rustc_lexer::TokenKind::Pound => token::Pound,
rustc_lexer::TokenKind::Tilde => token::Tilde,
rustc_lexer::TokenKind::Question => token::Question,
rustc_lexer::TokenKind::Colon => token::Colon,
rustc_lexer::TokenKind::Dollar => token::Dollar,
rustc_lexer::TokenKind::Eq => token::Eq,
rustc_lexer::TokenKind::Bang => token::Not,
rustc_lexer::TokenKind::Lt => token::Lt,
rustc_lexer::TokenKind::Gt => token::Gt,
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
rustc_lexer::TokenKind::And => token::BinOp(token::And),
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
// instead of keeping a table in `check_for_substitution`into the token. Ideally,
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
err.emit();
token?
}
rustc_lexer::TokenKind::Eof => token::Eof,
})
}
fn cook_doc_comment(
&self,
content_start: BytePos,