mirror of
https://github.com/rust-lang/rust.git
synced 2025-04-13 12:36:47 +00:00
Auto merge of #88781 - estebank:emoji-idents, r=oli-obk
Tokenize emoji as if they were valid identifiers. In the lexer, consider emoji to be valid identifiers and reject them later to avoid knock-down parse errors. Partially addresses #86102.
This commit is contained in:
commit
23a436606b
42
Cargo.lock
42
Cargo.lock
@ -4040,6 +4040,7 @@ name = "rustc_lexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"expect-test",
|
||||
"unic-emoji-char",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
@ -5510,6 +5511,47 @@ version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-property"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
|
||||
dependencies = [
|
||||
"unic-char-range",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-range"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-common"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-emoji-char"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
|
||||
dependencies = [
|
||||
"unic-char-property",
|
||||
"unic-char-range",
|
||||
"unic-ucd-version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-ucd-version"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
|
||||
dependencies = [
|
||||
"unic-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
|
@ -730,7 +730,7 @@ impl EmitterWriter {
|
||||
}
|
||||
|
||||
let source_string = match file.get_line(line.line_index - 1) {
|
||||
Some(s) => replace_tabs(&*s),
|
||||
Some(s) => normalize_whitespace(&*s),
|
||||
None => return Vec::new(),
|
||||
};
|
||||
|
||||
@ -1286,7 +1286,7 @@ impl EmitterWriter {
|
||||
}
|
||||
for &(ref text, _) in msg.iter() {
|
||||
// Account for newlines to align output to its label.
|
||||
for (line, text) in replace_tabs(text).lines().enumerate() {
|
||||
for (line, text) in normalize_whitespace(text).lines().enumerate() {
|
||||
buffer.append(
|
||||
0 + line,
|
||||
&format!(
|
||||
@ -1550,7 +1550,7 @@ impl EmitterWriter {
|
||||
|
||||
self.draw_line(
|
||||
&mut buffer,
|
||||
&replace_tabs(&unannotated_line),
|
||||
&normalize_whitespace(&unannotated_line),
|
||||
annotated_file.lines[line_idx + 1].line_index - 1,
|
||||
last_buffer_line_num,
|
||||
width_offset,
|
||||
@ -1672,7 +1672,7 @@ impl EmitterWriter {
|
||||
buffer.puts(
|
||||
row_num - 1,
|
||||
max_line_num_len + 3,
|
||||
&replace_tabs(
|
||||
&normalize_whitespace(
|
||||
&*file_lines
|
||||
.file
|
||||
.get_line(file_lines.lines[line_pos].line_index)
|
||||
@ -1698,7 +1698,7 @@ impl EmitterWriter {
|
||||
}
|
||||
|
||||
// print the suggestion
|
||||
buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
|
||||
buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);
|
||||
|
||||
// Colorize addition/replacements with green.
|
||||
for &SubstitutionHighlight { start, end } in highlight_parts {
|
||||
@ -2081,6 +2081,7 @@ fn num_decimal_digits(num: usize) -> usize {
|
||||
// We replace some characters so the CLI output is always consistent and underlines aligned.
|
||||
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
|
||||
('\t', " "), // We do our own tab replacement
|
||||
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
|
||||
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
|
||||
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
|
||||
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
|
||||
@ -2092,7 +2093,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
|
||||
('\u{2069}', ""),
|
||||
];
|
||||
|
||||
fn replace_tabs(str: &str) -> String {
|
||||
fn normalize_whitespace(str: &str) -> String {
|
||||
let mut s = str.to_string();
|
||||
for (c, replacement) in OUTPUT_REPLACEMENTS {
|
||||
s = s.replace(*c, replacement);
|
||||
|
@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
|
||||
use rustc_session::search_paths::PathKind;
|
||||
use rustc_session::{Limit, Session};
|
||||
use rustc_span::symbol::{sym, Ident, Symbol};
|
||||
use rustc_span::FileName;
|
||||
use rustc_span::{FileName, MultiSpan};
|
||||
use rustc_trait_selection::traits;
|
||||
use rustc_typeck as typeck;
|
||||
use tempfile::Builder as TempFileBuilder;
|
||||
@ -450,6 +450,19 @@ pub fn configure_and_expand(
|
||||
});
|
||||
}
|
||||
|
||||
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
|
||||
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
|
||||
let mut identifiers: Vec<_> = identifiers.drain().collect();
|
||||
identifiers.sort_by_key(|&(key, _)| key);
|
||||
for (ident, mut spans) in identifiers.into_iter() {
|
||||
spans.sort();
|
||||
sess.diagnostic().span_err(
|
||||
MultiSpan::from(spans),
|
||||
&format!("identifiers cannot contain emoji: `{}`", ident),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(krate)
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,7 @@ doctest = false
|
||||
# Note that this crate purposefully does not depend on other rustc crates
|
||||
[dependencies]
|
||||
unicode-xid = "0.2.0"
|
||||
unic-emoji-char = "0.9.0"
|
||||
|
||||
[dev-dependencies]
|
||||
expect-test = "1.0"
|
||||
|
@ -64,6 +64,8 @@ pub enum TokenKind {
|
||||
/// "ident" or "continue"
|
||||
/// At this step keywords are also considered identifiers.
|
||||
Ident,
|
||||
/// Like the above, but containing invalid unicode codepoints.
|
||||
InvalidIdent,
|
||||
/// "r#ident"
|
||||
RawIdent,
|
||||
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
|
||||
@ -411,6 +413,10 @@ impl Cursor<'_> {
|
||||
let kind = Str { terminated };
|
||||
Literal { kind, suffix_start }
|
||||
}
|
||||
// Identifier starting with an emoji. Only lexed for graceful error recovery.
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
_ => Unknown,
|
||||
};
|
||||
Token::new(token_kind, self.len_consumed())
|
||||
@ -492,10 +498,28 @@ impl Cursor<'_> {
|
||||
// we see a prefix here, it is definitely an unknown prefix.
|
||||
match self.first() {
|
||||
'#' | '"' | '\'' => UnknownPrefix,
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
_ => Ident,
|
||||
}
|
||||
}
|
||||
|
||||
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
|
||||
// Start is already eaten, eat the rest of identifier.
|
||||
self.eat_while(|c| {
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c)
|
||||
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|
||||
|| c == '\u{200d}'
|
||||
});
|
||||
// Known prefixes must have been handled earlier. So if
|
||||
// we see a prefix here, it is definitely an unknown prefix.
|
||||
match self.first() {
|
||||
'#' | '"' | '\'' => UnknownPrefix,
|
||||
_ => InvalidIdent,
|
||||
}
|
||||
}
|
||||
|
||||
fn number(&mut self, first_digit: char) -> LiteralKind {
|
||||
debug_assert!('0' <= self.prev() && self.prev() <= '9');
|
||||
let mut base = Base::Decimal;
|
||||
|
@ -1,3 +1,4 @@
|
||||
use crate::lexer::unicode_chars::UNICODE_ARRAY;
|
||||
use rustc_ast::ast::{self, AttrStyle};
|
||||
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
|
||||
use rustc_ast::tokenstream::{Spacing, TokenStream};
|
||||
@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
token::Ident(sym, is_raw_ident)
|
||||
}
|
||||
rustc_lexer::TokenKind::InvalidIdent
|
||||
// Do not recover an identifier with emoji if the codepoint is a confusable
|
||||
// with a recoverable substitution token, like `➖`.
|
||||
if UNICODE_ARRAY
|
||||
.iter()
|
||||
.find(|&&(c, _, _)| {
|
||||
let sym = self.str_from(start);
|
||||
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
|
||||
})
|
||||
.is_none() =>
|
||||
{
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
|
||||
let suffix_start = start + BytePos(suffix_start as u32);
|
||||
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
|
||||
@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
|
||||
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
|
||||
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
|
||||
|
||||
rustc_lexer::TokenKind::Unknown => {
|
||||
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
|
||||
let c = self.str_from(start).chars().next().unwrap();
|
||||
let mut err =
|
||||
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
|
||||
|
@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
|
||||
use rustc_span::{symbol::kw, BytePos, Pos, Span};
|
||||
|
||||
#[rustfmt::skip] // for line breaks
|
||||
const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
||||
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
||||
('
', "Line Separator", ' '),
|
||||
('
', "Paragraph Separator", ' '),
|
||||
(' ', "Ogham Space mark", ' '),
|
||||
|
@ -119,8 +119,13 @@ pub struct ParseSess {
|
||||
pub config: CrateConfig,
|
||||
pub edition: Edition,
|
||||
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
|
||||
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
|
||||
/// Places where raw identifiers were used. This is used to avoid complaining about idents
|
||||
/// clashing with keywords in new editions.
|
||||
pub raw_identifier_spans: Lock<Vec<Span>>,
|
||||
/// Places where identifiers contain invalid Unicode codepoints but otherwise look like valid
|
||||
/// identifiers. Useful to avoid bad tokenization when encountering emoji. We group them to
|
||||
/// provide a single error per unique incorrect identifier.
|
||||
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
|
||||
source_map: Lrc<SourceMap>,
|
||||
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
|
||||
/// Contains the spans of block expressions that could have been incomplete based on the
|
||||
@ -160,6 +165,7 @@ impl ParseSess {
|
||||
edition: ExpnId::root().expn_data().edition,
|
||||
missing_fragment_specifiers: Default::default(),
|
||||
raw_identifier_spans: Lock::new(Vec::new()),
|
||||
bad_unicode_identifiers: Lock::new(Default::default()),
|
||||
source_map,
|
||||
buffered_lints: Lock::new(vec![]),
|
||||
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
|
||||
|
@ -632,7 +632,7 @@ impl<'a> Classifier<'a> {
|
||||
},
|
||||
Some(c) => c,
|
||||
},
|
||||
TokenKind::RawIdent | TokenKind::UnknownPrefix => {
|
||||
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
|
||||
Class::Ident(self.new_span(before, text))
|
||||
}
|
||||
TokenKind::Lifetime { .. } => Class::Lifetime,
|
||||
|
16
src/test/ui/parser/emoji-identifiers.rs
Normal file
16
src/test/ui/parser/emoji-identifiers.rs
Normal file
@ -0,0 +1,16 @@
|
||||
struct ABig👩👩👧👧Family; //~ ERROR identifiers cannot contain emoji
|
||||
struct 👀; //~ ERROR identifiers cannot contain emoji
|
||||
impl 👀 {
|
||||
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
|
||||
👀
|
||||
}
|
||||
}
|
||||
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
|
||||
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
|
||||
//~^ ERROR identifiers cannot contain emoji
|
||||
}
|
||||
fn main() {
|
||||
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
|
||||
//~^ ERROR identifiers cannot contain emoji
|
||||
//~| ERROR unknown start of token: \u{2796}
|
||||
}
|
83
src/test/ui/parser/emoji-identifiers.stderr
Normal file
83
src/test/ui/parser/emoji-identifiers.stderr
Normal file
@ -0,0 +1,83 @@
|
||||
error: unknown start of token: \u{2796}
|
||||
--> $DIR/emoji-identifiers.rs:13:33
|
||||
|
|
||||
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
|
||||
| ^^
|
||||
|
|
||||
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
|
||||
|
|
||||
LL | let _ = i_like_to_😄_a_lot() - 4;
|
||||
| ~
|
||||
|
||||
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
|
||||
--> $DIR/emoji-identifiers.rs:13:13
|
||||
|
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
|
||||
...
|
||||
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
|
||||
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
|
||||
|
||||
error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
|
||||
--> $DIR/emoji-identifiers.rs:1:8
|
||||
|
|
||||
LL | struct ABig👩👩👧👧Family;
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `👀`
|
||||
--> $DIR/emoji-identifiers.rs:2:8
|
||||
|
|
||||
LL | struct 👀;
|
||||
| ^^
|
||||
LL | impl 👀 {
|
||||
| ^^
|
||||
LL | fn full_of_✨() -> 👀 {
|
||||
| ^^
|
||||
LL | 👀
|
||||
| ^^
|
||||
...
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ^^
|
||||
LL | 👀::full_of✨()
|
||||
| ^^
|
||||
|
||||
error: identifiers cannot contain emoji: `full_of_✨`
|
||||
--> $DIR/emoji-identifiers.rs:4:8
|
||||
|
|
||||
LL | fn full_of_✨() -> 👀 {
|
||||
| ^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
|
||||
--> $DIR/emoji-identifiers.rs:8:4
|
||||
|
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `full_of✨`
|
||||
--> $DIR/emoji-identifiers.rs:9:8
|
||||
|
|
||||
LL | 👀::full_of✨()
|
||||
| ^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
|
||||
--> $DIR/emoji-identifiers.rs:13:13
|
||||
|
|
||||
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
|
||||
--> $DIR/emoji-identifiers.rs:9:8
|
||||
|
|
||||
LL | struct 👀;
|
||||
| ---------- function or associated item `full_of✨` not found for this
|
||||
...
|
||||
LL | 👀::full_of✨()
|
||||
| ^^^^^^^^^
|
||||
| |
|
||||
| function or associated item not found in `👀`
|
||||
| help: there is an associated function with a similar name: `full_of_✨`
|
||||
|
||||
error: aborting due to 9 previous errors
|
||||
|
||||
Some errors have detailed explanations: E0425, E0599.
|
||||
For more information about an error, try `rustc --explain E0425`.
|
@ -1 +1 @@
|
||||
Subproject commit e1fb17631eb1b3665cdbe45b1c186111577ef512
|
||||
Subproject commit 7f08ace4f1305de7f3b1b0e2f765911957226bd4
|
@ -82,8 +82,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
|
||||
"bitflags",
|
||||
"block-buffer",
|
||||
"block-padding",
|
||||
"byteorder",
|
||||
"byte-tools",
|
||||
"byteorder",
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"chalk-derive",
|
||||
@ -140,9 +140,9 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
|
||||
"memmap2",
|
||||
"memoffset",
|
||||
"miniz_oxide",
|
||||
"num_cpus",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"object",
|
||||
"odht",
|
||||
"once_cell",
|
||||
@ -190,8 +190,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
|
||||
"serde_json",
|
||||
"sha-1",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"snap",
|
||||
"stable_deref_trait",
|
||||
"stacker",
|
||||
@ -211,6 +211,11 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[
|
||||
"tracing-subscriber",
|
||||
"tracing-tree",
|
||||
"typenum",
|
||||
"unic-char-property",
|
||||
"unic-char-range",
|
||||
"unic-common",
|
||||
"unic-emoji-char",
|
||||
"unic-ucd-version",
|
||||
"unicode-normalization",
|
||||
"unicode-script",
|
||||
"unicode-security",
|
||||
|
Loading…
Reference in New Issue
Block a user