mirror of
https://github.com/rust-lang/rust.git
synced 2025-05-14 02:49:40 +00:00
Detect confusing unicode characters and show the alternative
This commit is contained in:
parent
929ca3cb51
commit
7f63c7cf4c
@ -174,6 +174,10 @@ impl SpanHandler {
|
||||
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
|
||||
panic!(ExplicitBug);
|
||||
}
|
||||
pub fn span_bug_no_panic(&self, sp: Span, msg: &str) {
|
||||
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
|
||||
self.handler.bump_err_count();
|
||||
}
|
||||
/// Reports an unimplemented feature at `sp` as a compiler bug.
///
/// Prefixes `msg` with `"unimplemented "` and forwards it to `span_bug`;
/// this function never returns.
pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! {
    let text = format!("unimplemented {}", msg);
    self.span_bug(sp, &text)
}
|
||||
|
@ -26,6 +26,7 @@ use std::rc::Rc;
|
||||
pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};
|
||||
|
||||
pub mod comments;
|
||||
mod unicode_chars;
|
||||
|
||||
pub trait Reader {
|
||||
fn is_eof(&self) -> bool;
|
||||
@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> {
|
||||
c => {
|
||||
let last_bpos = self.last_pos;
|
||||
let bpos = self.pos;
|
||||
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c));
|
||||
unicode_chars::check_for_substitution(&self, c);
|
||||
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
186
src/libsyntax/parse/lexer/unicode_chars.rs
Normal file
186
src/libsyntax/parse/lexer/unicode_chars.rs
Normal file
@ -0,0 +1,186 @@
|
||||
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// Characters and their corresponding confusables were collected from
|
||||
// http://www.unicode.org/Public/security/revision-06/confusables.txt
|
||||
|
||||
use codemap::mk_sp as make_span;
|
||||
use super::StringReader;
|
||||
|
||||
/// Unicode characters that are easily mistaken for an ASCII token character.
///
/// Each entry is `(confusable, unicode name, ascii look-alike)`. Every
/// look-alike in the third column must also have an entry in `ASCII_ARRAY`.
///
/// Entries written as `\u{...}` escapes are characters that do not survive
/// Unicode normalization or width folding (fullwidth forms and code points
/// with a singleton canonical decomposition). Spelling them as escapes keeps
/// the key distinct from the ASCII character it imitates even if this file is
/// re-encoded by an editor or tool.
const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('ߺ', "Nko Lajanyalan", '_'),
    ('﹍', "Dashed Low Line", '_'),
    ('﹎', "Centreline Low Line", '_'),
    ('﹏', "Wavy Low Line", '_'),
    ('‐', "Hyphen", '-'),
    ('‑', "Non-Breaking Hyphen", '-'),
    ('‒', "Figure Dash", '-'),
    ('–', "En Dash", '-'),
    ('﹘', "Small Em Dash", '-'),
    ('⁃', "Hyphen Bullet", '-'),
    ('˗', "Modifier Letter Minus Sign", '-'),
    ('−', "Minus Sign", '-'),
    ('٫', "Arabic Decimal Separator", ','),
    ('‚', "Single Low-9 Quotation Mark", ','),
    ('ꓹ', "Lisu Letter Tone Na Po", ','),
    ('\u{37e}', "Greek Question Mark", ';'),
    ('ः', "Devanagari Sign Visarga", ':'),
    ('ઃ', "Gujarati Sign Visarga", ':'),
    ('\u{ff1a}', "Fullwidth Colon", ':'),
    ('։', "Armenian Full Stop", ':'),
    ('܃', "Syriac Supralinear Colon", ':'),
    ('܄', "Syriac Sublinear Colon", ':'),
    ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
    ('᠃', "Mongolian Full Stop", ':'),
    ('᠉', "Mongolian Manchu Full Stop", ':'),
    ('⁚', "Two Dot Punctuation", ':'),
    ('׃', "Hebrew Punctuation Sof Pasuq", ':'),
    ('˸', "Modifier Letter Raised Colon", ':'),
    ('꞉', "Modifier Letter Colon", ':'),
    ('∶', "Ratio", ':'),
    ('ː', "Modifier Letter Triangular Colon", ':'),
    ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
    ('\u{ff01}', "Fullwidth Exclamation Mark", '!'),
    ('ǃ', "Latin Letter Retroflex Click", '!'),
    ('ʔ', "Latin Letter Glottal Stop", '?'),
    ('ॽ', "Devanagari Letter Glottal Stop", '?'),
    ('Ꭾ', "Cherokee Letter He", '?'),
    ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
    ('․', "One Dot Leader", '.'),
    ('۔', "Arabic Full Stop", '.'),
    ('܁', "Syriac Supralinear Full Stop", '.'),
    ('܂', "Syriac Sublinear Full Stop", '.'),
    ('꘎', "Vai Full Stop", '.'),
    ('𐩐', "Kharoshthi Punctuation Dot", '.'),
    ('٠', "Arabic-Indic Digit Zero", '.'),
    ('۰', "Extended Arabic-Indic Digit Zero", '.'),
    ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
    ('՝', "Armenian Comma", '\''),
    ('\u{ff07}', "Fullwidth Apostrophe", '\''),
    ('‘', "Left Single Quotation Mark", '\''),
    ('’', "Right Single Quotation Mark", '\''),
    ('‛', "Single High-Reversed-9 Quotation Mark", '\''),
    ('′', "Prime", '\''),
    ('‵', "Reversed Prime", '\''),
    ('՚', "Armenian Apostrophe", '\''),
    ('׳', "Hebrew Punctuation Geresh", '\''),
    ('\u{1fef}', "Greek Varia", '\''),
    ('\u{ff40}', "Fullwidth Grave Accent", '\''),
    ('΄', "Greek Tonos", '\''),
    ('\u{1ffd}', "Greek Oxia", '\''),
    ('᾽', "Greek Koronis", '\''),
    ('᾿', "Greek Psili", '\''),
    ('῾', "Greek Dasia", '\''),
    ('ʹ', "Modifier Letter Prime", '\''),
    ('\u{374}', "Greek Numeral Sign", '\''),
    ('ˊ', "Modifier Letter Acute Accent", '\''),
    ('ˋ', "Modifier Letter Grave Accent", '\''),
    ('˴', "Modifier Letter Middle Grave Accent", '\''),
    ('ʻ', "Modifier Letter Turned Comma", '\''),
    ('ʽ', "Modifier Letter Reversed Comma", '\''),
    ('ʼ', "Modifier Letter Apostrophe", '\''),
    ('ʾ', "Modifier Letter Right Half Ring", '\''),
    ('ꞌ', "Latin Small Letter Saltillo", '\''),
    ('י', "Hebrew Letter Yod", '\''),
    ('ߴ', "Nko High Tone Apostrophe", '\''),
    ('ߵ', "Nko Low Tone Apostrophe", '\''),
    ('\u{ff3b}', "Fullwidth Left Square Bracket", '('),
    ('❨', "Medium Left Parenthesis Ornament", '('),
    ('❲', "Light Left Tortoise Shell Bracket Ornament", '('),
    ('〔', "Left Tortoise Shell Bracket", '('),
    ('﴾', "Ornate Left Parenthesis", '('),
    ('\u{ff3d}', "Fullwidth Right Square Bracket", ')'),
    ('❩', "Medium Right Parenthesis Ornament", ')'),
    ('❳', "Light Right Tortoise Shell Bracket Ornament", ')'),
    ('〕', "Right Tortoise Shell Bracket", ')'),
    ('﴿', "Ornate Right Parenthesis", ')'),
    ('❴', "Medium Left Curly Bracket Ornament", '{'),
    ('❵', "Medium Right Curly Bracket Ornament", '}'),
    ('⁎', "Low Asterisk", '*'),
    ('٭', "Arabic Five Pointed Star", '*'),
    ('∗', "Asterisk Operator", '*'),
    ('᜵', "Philippine Single Punctuation", '/'),
    ('⁁', "Caret Insertion Point", '/'),
    ('∕', "Division Slash", '/'),
    ('⁄', "Fraction Slash", '/'),
    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
    ('⟋', "Mathematical Rising Diagonal", '/'),
    ('⧸', "Big Solidus", '/'),
    ('㇓', "Cjk Stroke Sp", '/'),
    ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
    ('丿', "Cjk Unified Ideograph-4E3F", '/'),
    ('⼃', "Kangxi Radical Slash", '/'),
    ('\u{ff3c}', "Fullwidth Reverse Solidus", '\\'),
    ('﹨', "Small Reverse Solidus", '\\'),
    ('∖', "Set Minus", '\\'),
    ('⟍', "Mathematical Falling Diagonal", '\\'),
    ('⧵', "Reverse Solidus Operator", '\\'),
    ('⧹', "Big Reverse Solidus", '\\'),
    ('㇔', "Cjk Stroke D", '\\'),
    ('丶', "Cjk Unified Ideograph-4E36", '\\'),
    ('⼂', "Kangxi Radical Dot", '\\'),
    ('ꝸ', "Latin Small Letter Um", '&'),
    ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
    ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
    ('˂', "Modifier Letter Left Arrowhead", '<'),
    ('꓿', "Lisu Punctuation Full Stop", '='),
    ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
    ('˃', "Modifier Letter Right Arrowhead", '>'),
    ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'),
    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'),
];
|
||||
|
||||
/// Human-readable names for the ASCII characters that appear as the
/// look-alike (third) column of `UNICODE_ARRAY`.
///
/// Each entry is `(ascii character, display name)`.
const ASCII_ARRAY: &'static [(char, &'static str)] = &[
    // Connectors and separators.
    ('_', "Underscore"),
    ('-', "Minus/Hyphen"),
    (',', "Comma"),
    (';', "Semicolon"),
    (':', "Colon"),
    // Sentence punctuation and quotes.
    ('!', "Exclamation Mark"),
    ('?', "Question Mark"),
    ('.', "Period"),
    ('\'', "Single Quote"),
    // Delimiters.
    ('(', "Left Parenthesis"),
    (')', "Right Parenthesis"),
    ('{', "Left Curly Brace"),
    ('}', "Right Curly Brace"),
    // Operators.
    ('*', "Asterisk"),
    ('/', "Slash"),
    ('\\', "Backslash"),
    ('&', "Ampersand"),
    ('+', "Plus Sign"),
    ('<', "Less-Than Sign"),
    ('=', "Equals Sign"),
    ('>', "Greater-Than Sign"),
];
|
||||
|
||||
pub fn check_for_substitution(reader: &StringReader, ch: char) {
|
||||
UNICODE_ARRAY
|
||||
.iter()
|
||||
.find(|&&(c, _, _)| c == ch)
|
||||
.map(|&(_, u_name, ascii_char)| {
|
||||
let span = make_span(reader.last_pos, reader.pos);
|
||||
match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
|
||||
Some(&(ascii_char, ascii_name)) => {
|
||||
let msg =
|
||||
format!("unicode character '{}' ({}) looks much like '{}' ({}), but it's not",
|
||||
ch, u_name, ascii_char, ascii_name);
|
||||
reader.help_span(span, &msg);
|
||||
},
|
||||
None => {
|
||||
reader
|
||||
.span_diagnostic
|
||||
.span_bug_no_panic(span,
|
||||
&format!("substitution character not found for '{}'", ch));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
18
src/test/parse-fail/unicode-chars.rs
Normal file
18
src/test/parse-fail/unicode-chars.rs
Normal file
@ -0,0 +1,18 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// compile-flags: -Z parse-only
|
||||
// ignore-tidy-linelength
|
||||
|
||||
fn main() {
    // The statement below is deliberately terminated with U+037E (Greek
    // Question Mark), not an ASCII semicolon; do not "fix" or normalize it.
    let y = 0;
    //~^ ERROR unknown start of token: \u{37e}
    //~^^ HELP unicode character ';' (Greek Question Mark) looks much like ';' (Semicolon), but it's not
}
|
Loading…
Reference in New Issue
Block a user