From 395ee0b79f23b90593b01dd0a78451b8c93b0aa6 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 6 May 2019 11:53:40 +0300 Subject: [PATCH] Introduce rustc_lexer The idea here is to make a reusable library out of the existing rust-lexer, by separating out pure lexing and rustc-specific concerns, like spans, error reporting and interning. So, rustc_lexer operates directly on `&str`, produces simple tokens which are a pair of type-tag and a bit of original text, and does not report errors, instead storing them as flags on the token. --- Cargo.lock | 8 + src/librustc_lexer/Cargo.toml | 9 + src/librustc_lexer/src/cursor.rs | 57 + src/librustc_lexer/src/lib.rs | 710 ++++++++ src/libsyntax/Cargo.toml | 1 + src/libsyntax/parse/lexer/comments.rs | 290 +--- src/libsyntax/parse/lexer/mod.rs | 1451 +++++------------ src/libsyntax/parse/lexer/unicode_chars.rs | 41 +- src/libsyntax/util/parser_testing.rs | 8 +- ...nicode-confusable-in-float-literal-expt.rs | 1 + ...de-confusable-in-float-literal-expt.stderr | 8 +- .../ui/parser/lex-bad-numeric-literals.stderr | 4 +- src/test/ui/parser/raw-byte-string-eof.stderr | 4 +- .../ui/parser/raw-byte-string-literals.stderr | 4 +- src/test/ui/parser/raw-str-delim.stderr | 2 +- 15 files changed, 1337 insertions(+), 1261 deletions(-) create mode 100644 src/librustc_lexer/Cargo.toml create mode 100644 src/librustc_lexer/src/cursor.rs create mode 100644 src/librustc_lexer/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8fd32989de3..bdc746c0bb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2972,6 +2972,13 @@ dependencies = [ "tempfile 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rustc_lexer" +version = "0.1.0" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rustc_lint" version = "0.0.0" @@ -3622,6 +3629,7 @@ dependencies = [ "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "rustc_data_structures 
0.0.0", "rustc_errors 0.0.0", + "rustc_lexer 0.1.0", "rustc_macros 0.1.0", "rustc_target 0.0.0", "scoped-tls 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/src/librustc_lexer/Cargo.toml b/src/librustc_lexer/Cargo.toml new file mode 100644 index 00000000000..9c0230e8322 --- /dev/null +++ b/src/librustc_lexer/Cargo.toml @@ -0,0 +1,9 @@ +[package] +authors = ["The Rust Project Developers"] +name = "rustc_lexer" +version = "0.1.0" +edition = "2018" + +# Note that this crate purposefully does not depend on other rustc crates +[dependencies] +unicode-xid = { version = "0.1.0", optional = true } diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs new file mode 100644 index 00000000000..5831159c344 --- /dev/null +++ b/src/librustc_lexer/src/cursor.rs @@ -0,0 +1,57 @@ +use std::str::Chars; + +pub(crate) struct Cursor<'a> { + initial_len: usize, + chars: Chars<'a>, + #[cfg(debug_assertions)] + prev: char, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + initial_len: input.len(), + chars: input.chars(), + #[cfg(debug_assertions)] + prev: EOF_CHAR, + } + } + /// For debug assertions only + pub(crate) fn prev(&self) -> char { + #[cfg(debug_assertions)] + { + self.prev + } + + #[cfg(not(debug_assertions))] + { + '\0' + } + } + pub(crate) fn nth_char(&self, n: usize) -> char { + self.chars().nth(n).unwrap_or(EOF_CHAR) + } + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + pub(crate) fn len_consumed(&self) -> usize { + self.initial_len - self.chars.as_str().len() + } + /// Returns an iterator over the remaining characters. + fn chars(&self) -> Chars<'a> { + self.chars.clone() + } + /// Moves to the next character. 
+ pub(crate) fn bump(&mut self) -> Option<char> { + let c = self.chars.next()?; + + #[cfg(debug_assertions)] + { + self.prev = c; + } + + Some(c) + } +} diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs new file mode 100644 index 00000000000..a21190ec332 --- /dev/null +++ b/src/librustc_lexer/src/lib.rs @@ -0,0 +1,710 @@ +// We want to be able to build this crate with a stable compiler, so feature +// flags should be optional. +#![cfg_attr(not(feature = "unicode-xid"), feature(rustc_private))] +#![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))] + +mod cursor; + +use crate::cursor::{Cursor, EOF_CHAR}; + +pub struct Token { + pub kind: TokenKind, + pub len: usize, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TokenKind { + LineComment, + BlockComment { terminated: bool }, + Whitespace, + Ident, + RawIdent, + Literal { kind: LiteralKind, suffix_start: usize }, + Lifetime { starts_with_number: bool }, + Semi, + Comma, + DotDotDot, + DotDotEq, + DotDot, + Dot, + OpenParen, + CloseParen, + OpenBrace, + CloseBrace, + OpenBracket, + CloseBracket, + At, + Pound, + Tilde, + Question, + ColonColon, + Colon, + Dollar, + EqEq, + Eq, + FatArrow, + Ne, + Not, + Le, + LArrow, + Lt, + ShlEq, + Shl, + Ge, + Gt, + ShrEq, + Shr, + RArrow, + Minus, + MinusEq, + And, + AndAnd, + AndEq, + Or, + OrOr, + OrEq, + PlusEq, + Plus, + StarEq, + Star, + SlashEq, + Slash, + CaretEq, + Caret, + PercentEq, + Percent, + Unknown, +} +use self::TokenKind::*; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + Int { base: Base, empty_int: bool }, + Float { base: Base, empty_exponent: bool }, + Char { terminated: bool }, + Byte { terminated: bool }, + Str { terminated: bool }, + ByteStr { terminated: bool }, + RawStr { n_hashes: usize, started: bool, terminated: bool }, + RawByteStr { n_hashes: usize, started: bool, terminated: bool }, +} +use self::LiteralKind::*; + +#[derive(Clone, Copy, Debug, 
PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + Binary, + Octal, + Hexadecimal, + Decimal, +} + +impl Token { + fn new(kind: TokenKind, len: usize) -> Token { + Token { kind, len } + } +} + +pub fn strip_shebang(input: &str) -> Option { + debug_assert!(!input.is_empty()); + if !input.starts_with("#!") || input.starts_with("#![") { + return None; + } + Some(input.find('\n').unwrap_or(input.len())) +} + +pub fn first_token(input: &str) -> Token { + debug_assert!(!input.is_empty()); + Cursor::new(input).advance_token() +} + +pub fn tokenize(mut input: &str) -> impl Iterator + '_ { + std::iter::from_fn(move || { + if input.is_empty() { + return None; + } + let token = first_token(input); + input = &input[token.len..]; + Some(token) + }) +} + +impl Cursor<'_> { + fn advance_token(&mut self) -> Token { + let first_char = self.bump().unwrap(); + let token_kind = match first_char { + '/' => match self.nth_char(0) { + '/' => self.line_comment(), + '*' => self.block_comment(), + _ => { + if self.eat_assign() { + SlashEq + } else { + Slash + } + } + }, + c if character_properties::is_whitespace(c) => self.whitespace(), + 'r' => match (self.nth_char(0), self.nth_char(1)) { + ('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(), + ('#', _) | ('"', _) => { + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + 'b' => match (self.nth_char(0), self.nth_char(1)) { + ('\'', _) => { + self.bump(); + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Byte { terminated }; + Literal { kind, suffix_start } + } + ('"', _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if 
terminated { + self.eat_literal_suffix(); + } + let kind = ByteStr { terminated }; + Literal { kind, suffix_start } + } + ('r', '"') | ('r', '#') => { + self.bump(); + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawByteStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + c if character_properties::is_id_start(c) => self.ident(), + c @ '0'..='9' => { + let literal_kind = self.number(c); + let suffix_start = self.len_consumed(); + self.eat_literal_suffix(); + TokenKind::Literal { kind: literal_kind, suffix_start } + } + ';' => Semi, + ',' => Comma, + '.' => { + if self.nth_char(0) == '.' { + self.bump(); + if self.nth_char(0) == '.' { + self.bump(); + DotDotDot + } else if self.nth_char(0) == '=' { + self.bump(); + DotDotEq + } else { + DotDot + } + } else { + Dot + } + } + '(' => OpenParen, + ')' => CloseParen, + '{' => OpenBrace, + '}' => CloseBrace, + '[' => OpenBracket, + ']' => CloseBracket, + '@' => At, + '#' => Pound, + '~' => Tilde, + '?' => Question, + ':' => { + if self.nth_char(0) == ':' { + self.bump(); + ColonColon + } else { + Colon + } + } + '$' => Dollar, + '=' => { + if self.nth_char(0) == '=' { + self.bump(); + EqEq + } else if self.nth_char(0) == '>' { + self.bump(); + FatArrow + } else { + Eq + } + } + '!' 
=> { + if self.nth_char(0) == '=' { + self.bump(); + Ne + } else { + Not + } + } + '<' => match self.nth_char(0) { + '=' => { + self.bump(); + Le + } + '<' => { + self.bump(); + if self.eat_assign() { ShlEq } else { Shl } + } + '-' => { + self.bump(); + LArrow + } + _ => Lt, + }, + '>' => match self.nth_char(0) { + '=' => { + self.bump(); + Ge + } + '>' => { + self.bump(); + if self.eat_assign() { ShrEq } else { Shr } + } + _ => Gt, + }, + '-' => { + if self.nth_char(0) == '>' { + self.bump(); + RArrow + } else { + if self.eat_assign() { MinusEq } else { Minus } + } + } + '&' => { + if self.nth_char(0) == '&' { + self.bump(); + AndAnd + } else { + if self.eat_assign() { AndEq } else { And } + } + } + '|' => { + if self.nth_char(0) == '|' { + self.bump(); + OrOr + } else { + if self.eat_assign() { OrEq } else { Or } + } + } + '+' => { + if self.eat_assign() { + PlusEq + } else { + Plus + } + } + '*' => { + if self.eat_assign() { + StarEq + } else { + Star + } + } + '^' => { + if self.eat_assign() { + CaretEq + } else { + Caret + } + } + '%' => { + if self.eat_assign() { + PercentEq + } else { + Percent + } + } + '\'' => self.lifetime_or_char(), + '"' => { + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Str { terminated }; + Literal { kind, suffix_start } + } + _ => Unknown, + }; + Token::new(token_kind, self.len_consumed()) + } + + fn line_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '/'); + self.bump(); + loop { + match self.nth_char(0) { + '\n' => break, + '\r' if self.nth_char(1) == '\n' => break, + EOF_CHAR if self.is_eof() => break, + _ => { + self.bump(); + } + } + } + LineComment + } + + fn block_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '*'); + self.bump(); + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.nth_char(0) == 
'*' => { + self.bump(); + depth += 1; + } + '*' if self.nth_char(0) == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + break; + } + } + _ => (), + } + } + + BlockComment { terminated: depth == 0 } + } + + fn whitespace(&mut self) -> TokenKind { + debug_assert!(character_properties::is_whitespace(self.prev())); + while character_properties::is_whitespace(self.nth_char(0)) { + self.bump(); + } + Whitespace + } + + fn raw_ident(&mut self) -> TokenKind { + debug_assert!( + self.prev() == 'r' + && self.nth_char(0) == '#' + && character_properties::is_id_start(self.nth_char(1)) + ); + self.bump(); + self.bump(); + while character_properties::is_id_continue(self.nth_char(0)) { + self.bump(); + } + RawIdent + } + + fn ident(&mut self) -> TokenKind { + debug_assert!(character_properties::is_id_start(self.prev())); + while character_properties::is_id_continue(self.nth_char(0)) { + self.bump(); + } + Ident + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + debug_assert!('0' <= self.prev() && self.prev() <= '9'); + let mut base = Base::Decimal; + if first_digit == '0' { + let has_digits = match self.nth_char(0) { + 'b' => { + base = Base::Binary; + self.bump(); + self.eat_decimal_digits() + } + 'o' => { + base = Base::Octal; + self.bump(); + self.eat_decimal_digits() + } + 'x' => { + base = Base::Hexadecimal; + self.bump(); + self.eat_hexadecimal_digits() + } + '0'..='9' | '_' | '.' | 'e' | 'E' => { + self.eat_decimal_digits(); + true + } + // just a 0 + _ => return Int { base, empty_int: false }, + }; + if !has_digits { + return Int { base, empty_int: true }; + } + } else { + self.eat_decimal_digits(); + }; + + match self.nth_char(0) { + // Don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + '.' if self.nth_char(1) != '.' 
+ && !character_properties::is_id_start(self.nth_char(1)) => + { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.nth_char(0).is_digit(10) { + self.eat_decimal_digits(); + match self.nth_char(0) { + 'e' | 'E' => { + self.bump(); + empty_exponent = self.float_exponent().is_err() + } + _ => (), + } + } + Float { base, empty_exponent } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = self.float_exponent().is_err(); + Float { base, empty_exponent } + } + _ => Int { base, empty_int: false }, + } + } + + fn lifetime_or_char(&mut self) -> TokenKind { + debug_assert!(self.prev() == '\''); + let mut starts_with_number = false; + if (character_properties::is_id_start(self.nth_char(0)) + || self.nth_char(0).is_digit(10) && { + starts_with_number = true; + true + }) + && self.nth_char(1) != '\'' + { + self.bump(); + while character_properties::is_id_continue(self.nth_char(0)) { + self.bump(); + } + + return if self.nth_char(0) == '\'' { + self.bump(); + let kind = Char { terminated: true }; + Literal { kind, suffix_start: self.len_consumed() } + } else { + Lifetime { starts_with_number } + }; + } + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Char { terminated }; + return Literal { kind, suffix_start }; + } + + fn single_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '\''); + // parse `'''` as a single char literal + if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { + self.bump(); + } + let mut first = true; + loop { + match self.nth_char(0) { + '/' if !first => break, + '\n' if self.nth_char(1) != '\'' => break, + '\r' if self.nth_char(1) == '\n' => break, + EOF_CHAR if self.is_eof() => break, + '\'' => { + self.bump(); + return true; + } + '\\' => { + self.bump(); + self.bump(); + } + _ => { + self.bump(); + } + } + first = false; + } + 
false + } + + fn double_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '"'); + loop { + match self.nth_char(0) { + '"' => { + self.bump(); + return true; + } + EOF_CHAR if self.is_eof() => return false, + '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => { + self.bump(); + } + _ => (), + } + self.bump(); + } + } + + fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { + debug_assert!(self.prev() == 'r'); + let n_hashes = { + let mut acc: usize = 0; + loop { + match self.bump() { + Some('#') => acc += 1, + Some('"') => break acc, + None | Some(_) => return (acc, false, false), + } + } + }; + + loop { + match self.bump() { + Some('"') => { + let mut acc = n_hashes; + while self.nth_char(0) == '#' && acc > 0 { + self.bump(); + acc -= 1; + } + if acc == 0 { + return (n_hashes, true, true); + } + } + Some(_) => (), + None => return (n_hashes, true, false), + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn float_exponent(&mut self) -> Result<(), ()> { + debug_assert!(self.prev() == 'e' || self.prev() == 'E'); + if self.nth_char(0) == '-' || self.nth_char(0) == '+' { + self.bump(); + } + if self.eat_decimal_digits() { Ok(()) } else { Err(()) } + } + + fn eat_literal_suffix(&mut self) { + if !character_properties::is_id_start(self.nth_char(0)) { + return; + } + self.bump(); + + while character_properties::is_id_continue(self.nth_char(0)) { + self.bump(); + } + } + + fn eat_assign(&mut self) -> bool { + if self.nth_char(0) == '=' { + self.bump(); + true + } else { + 
false + } + } +} + +pub mod character_properties { + // this is Pattern_White_Space + #[cfg(feature = "unicode-xid")] + pub fn is_whitespace(c: char) -> bool { + match c { + '\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}' + | '\u{0085}' | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } + } + + #[cfg(not(feature = "unicode-xid"))] + pub fn is_whitespace(c: char) -> bool { + core::unicode::property::Pattern_White_Space(c) + } + + // this is XID_Start OR '_' (which formally is not a XID_Start) + #[cfg(feature = "unicode-xid")] + pub fn is_id_start(c: char) -> bool { + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) + } + + #[cfg(not(feature = "unicode-xid"))] + pub fn is_id_start(c: char) -> bool { + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || c == '_' + || (c > '\x7f' && c.is_xid_start()) + } + + // this is XID_Continue + #[cfg(feature = "unicode-xid")] + pub fn is_id_continue(c: char) -> bool { + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) + } + + #[cfg(not(feature = "unicode-xid"))] + pub fn is_id_continue(c: char) -> bool { + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + || (c > '\x7f' && c.is_xid_continue()) + } +} diff --git a/src/libsyntax/Cargo.toml b/src/libsyntax/Cargo.toml index c5daa656476..15c685b3b7b 100644 --- a/src/libsyntax/Cargo.toml +++ b/src/libsyntax/Cargo.toml @@ -18,6 +18,7 @@ lazy_static = "1.0.0" syntax_pos = { path = "../libsyntax_pos" } errors = { path = "../librustc_errors", package = "rustc_errors" } rustc_data_structures = { path = "../librustc_data_structures" } +rustc_lexer = { path = "../librustc_lexer" } rustc_macros = { path = "../librustc_macros" } rustc_target = { path = "../librustc_target" } smallvec = { version = "0.6.7", 
features = ["union", "may_dangle"] } diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index 6ed2a7adad1..d8f22072d7d 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -2,11 +2,10 @@ pub use CommentStyle::*; use crate::ast; use crate::source_map::SourceMap; -use crate::parse::lexer::{is_block_doc_comment, is_pattern_whitespace}; -use crate::parse::lexer::{self, ParseSess, StringReader}; +use crate::parse::lexer::is_block_doc_comment; +use crate::parse::lexer::ParseSess; use syntax_pos::{BytePos, CharPos, Pos, FileName}; -use log::debug; use std::usize; @@ -135,66 +134,6 @@ pub fn strip_doc_comment_decoration(comment: &str) -> String { panic!("not a doc-comment: {}", comment); } -fn push_blank_line_comment(rdr: &StringReader<'_>, comments: &mut Vec) { - debug!(">>> blank-line comment"); - comments.push(Comment { - style: BlankLine, - lines: Vec::new(), - pos: rdr.pos, - }); -} - -fn consume_whitespace_counting_blank_lines( - rdr: &mut StringReader<'_>, - comments: &mut Vec -) { - while is_pattern_whitespace(rdr.ch) && !rdr.is_eof() { - if rdr.ch_is('\n') { - push_blank_line_comment(rdr, &mut *comments); - } - rdr.bump(); - } -} - -fn read_shebang_comment(rdr: &mut StringReader<'_>, - code_to_the_left: bool, - comments: &mut Vec) { - debug!(">>> shebang comment"); - let p = rdr.pos; - debug!("<<< shebang comment"); - comments.push(Comment { - style: if code_to_the_left { Trailing } else { Isolated }, - lines: vec![rdr.read_one_line_comment()], - pos: p, - }); -} - -fn read_line_comments(rdr: &mut StringReader<'_>, - code_to_the_left: bool, - comments: &mut Vec) { - debug!(">>> line comments"); - let p = rdr.pos; - let mut lines: Vec = Vec::new(); - while rdr.ch_is('/') && rdr.nextch_is('/') { - let line = rdr.read_one_line_comment(); - debug!("{}", line); - // Doc comments are not put in comments. 
- if is_doc_comment(&line[..]) { - break; - } - lines.push(line); - rdr.consume_non_eol_whitespace(); - } - debug!("<<< line comments"); - if !lines.is_empty() { - comments.push(Comment { - style: if code_to_the_left { Trailing } else { Isolated }, - lines, - pos: p, - }); - } -} - /// Returns `None` if the first `col` chars of `s` contain a non-whitespace char. /// Otherwise returns `Some(k)` where `k` is first char offset after that leading /// whitespace. Note that `k` may be outside bounds of `s`. @@ -209,170 +148,103 @@ fn all_whitespace(s: &str, col: CharPos) -> Option { Some(idx) } -fn trim_whitespace_prefix_and_push_line(lines: &mut Vec, s: String, col: CharPos) { +fn trim_whitespace_prefix(s: &str, col: CharPos) -> &str { let len = s.len(); - let s1 = match all_whitespace(&s[..], col) { - Some(col) => { - if col < len { - s[col..len].to_string() - } else { - String::new() - } - } + match all_whitespace(&s, col) { + Some(col) => if col < len { &s[col..] } else { "" }, None => s, - }; - debug!("pushing line: {}", s1); - lines.push(s1); + } } -fn read_block_comment(rdr: &mut StringReader<'_>, - code_to_the_left: bool, - comments: &mut Vec) { - debug!(">>> block comment"); - let p = rdr.pos; - let mut lines: Vec = Vec::new(); - - // Count the number of chars since the start of the line by rescanning. 
- let src_index = rdr.src_index(rdr.source_file.line_begin_pos(rdr.pos)); - let end_src_index = rdr.src_index(rdr.pos); - assert!(src_index <= end_src_index, - "src_index={}, end_src_index={}, line_begin_pos={}", - src_index, end_src_index, rdr.source_file.line_begin_pos(rdr.pos).to_u32()); - - let col = CharPos(rdr.src[src_index..end_src_index].chars().count()); - - rdr.bump(); - rdr.bump(); - - let mut curr_line = String::from("/*"); - - // doc-comments are not really comments, they are attributes - if (rdr.ch_is('*') && !rdr.nextch_is('*')) || rdr.ch_is('!') { - while !(rdr.ch_is('*') && rdr.nextch_is('/')) && !rdr.is_eof() { - curr_line.push(rdr.ch.unwrap()); - rdr.bump(); - } - if !rdr.is_eof() { - curr_line.push_str("*/"); - rdr.bump(); - rdr.bump(); - } - if is_block_doc_comment(&curr_line[..]) { - return; - } - assert!(!curr_line.contains('\n')); - lines.push(curr_line); - } else { - let mut level: isize = 1; - while level > 0 { - debug!("=== block comment level {}", level); - if rdr.is_eof() { - rdr.fatal_span_(rdr.pos, rdr.pos, "unterminated block comment").raise(); - } - if rdr.ch_is('\n') { - trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col); - curr_line = String::new(); - rdr.bump(); - } else { - curr_line.push(rdr.ch.unwrap()); - if rdr.ch_is('/') && rdr.nextch_is('*') { - rdr.bump(); - rdr.bump(); - curr_line.push('*'); - level += 1; - } else { - if rdr.ch_is('*') && rdr.nextch_is('/') { - rdr.bump(); - rdr.bump(); - curr_line.push('/'); - level -= 1; - } else { - rdr.bump(); - } - } - } - } - if !curr_line.is_empty() { - trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col); - } +fn split_block_comment_into_lines( + text: &str, + col: CharPos, +) -> Vec { + let mut res: Vec = vec![]; + let mut lines = text.lines(); + // just push the first line + res.extend(lines.next().map(|it| it.to_string())); + // for other lines, strip common whitespace prefix + for line in lines { + res.push(trim_whitespace_prefix(line, 
col).to_string()) } - - let mut style = if code_to_the_left { - Trailing - } else { - Isolated - }; - rdr.consume_non_eol_whitespace(); - if !rdr.is_eof() && !rdr.ch_is('\n') && lines.len() == 1 { - style = Mixed; - } - debug!("<<< block comment"); - comments.push(Comment { - style, - lines, - pos: p, - }); -} - - -fn consume_comment(rdr: &mut StringReader<'_>, - comments: &mut Vec, - code_to_the_left: &mut bool, - anything_to_the_left: &mut bool) { - debug!(">>> consume comment"); - if rdr.ch_is('/') && rdr.nextch_is('/') { - read_line_comments(rdr, *code_to_the_left, comments); - *code_to_the_left = false; - *anything_to_the_left = false; - } else if rdr.ch_is('/') && rdr.nextch_is('*') { - read_block_comment(rdr, *code_to_the_left, comments); - *anything_to_the_left = true; - } else if rdr.ch_is('#') && rdr.nextch_is('!') { - read_shebang_comment(rdr, *code_to_the_left, comments); - *code_to_the_left = false; - *anything_to_the_left = false; - } else { - panic!(); - } - debug!("<<< consume comment"); + res } // it appears this function is called only from pprust... that's // probably not a good thing. -pub fn gather_comments(sess: &ParseSess, path: FileName, src: String) -> Vec -{ +pub fn gather_comments(sess: &ParseSess, path: FileName, src: String) -> Vec { let cm = SourceMap::new(sess.source_map().path_mapping().clone()); let source_file = cm.new_source_file(path, src); - let mut rdr = lexer::StringReader::new(sess, source_file, None); + let text = (*source_file.src.as_ref().unwrap()).clone(); + let text: &str = text.as_str(); + let start_bpos = source_file.start_pos; + let mut pos = 0; let mut comments: Vec = Vec::new(); - let mut code_to_the_left = false; // Only code - let mut anything_to_the_left = false; // Code or comments + let mut code_to_the_left = false; - while !rdr.is_eof() { - loop { - // Eat all the whitespace and count blank lines. 
- rdr.consume_non_eol_whitespace(); - if rdr.ch_is('\n') { - if anything_to_the_left { - rdr.bump(); // The line is not blank, do not count. + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { + comments.push(Comment { + style: Isolated, + lines: vec![text[..shebang_len].to_string()], + pos: start_bpos, + }); + pos += shebang_len; + } + + for token in rustc_lexer::tokenize(&text[pos..]) { + let token_text = &text[pos..pos + token.len]; + match token.kind { + rustc_lexer::TokenKind::Whitespace => { + if let Some(mut idx) = token_text.find('\n') { + code_to_the_left = false; + while let Some(next_newline) = &token_text[idx + 1..].find('\n') { + idx = idx + 1 + next_newline; + comments.push(Comment { + style: BlankLine, + lines: vec![], + pos: start_bpos + BytePos((pos + idx) as u32), + }); + } } - consume_whitespace_counting_blank_lines(&mut rdr, &mut comments); - code_to_the_left = false; - anything_to_the_left = false; } - // Eat one comment group - if rdr.peeking_at_comment() { - consume_comment(&mut rdr, &mut comments, - &mut code_to_the_left, &mut anything_to_the_left); - } else { - break + rustc_lexer::TokenKind::BlockComment { terminated: _ } => { + if !is_block_doc_comment(token_text) { + let code_to_the_right = match text[pos + token.len..].chars().next() { + Some('\r') | Some('\n') => false, + _ => true, + }; + let style = match (code_to_the_left, code_to_the_right) { + (true, true) | (false, true) => Mixed, + (false, false) => Isolated, + (true, false) => Trailing, + }; + + // Count the number of chars since the start of the line by rescanning. 
+ let pos_in_file = start_bpos + BytePos(pos as u32); + let line_begin_in_file = source_file.line_begin_pos(pos_in_file); + let line_begin_pos = (line_begin_in_file - start_bpos).to_usize(); + let col = CharPos(text[line_begin_pos..pos].chars().count()); + + let lines = split_block_comment_into_lines(token_text, col); + comments.push(Comment { style, lines, pos: pos_in_file }) + } + } + rustc_lexer::TokenKind::LineComment => { + if !is_doc_comment(token_text) { + comments.push(Comment { + style: if code_to_the_left { Trailing } else { Isolated }, + lines: vec![token_text.to_string()], + pos: start_bpos + BytePos(pos as u32), + }) + } + } + _ => { + code_to_the_left = true; } } - - rdr.next_token(); - code_to_the_left = true; - anything_to_the_left = true; + pos += token.len; } comments diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 7be8e57c7f8..317c49c7d35 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -6,11 +6,12 @@ use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_c use errors::{FatalError, Diagnostic, DiagnosticBuilder}; use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; -use core::unicode::property::Pattern_White_Space; +use rustc_lexer::Base; use std::borrow::Cow; use std::char; use std::iter; +use std::convert::TryInto; use rustc_data_structures::sync::Lrc; use log::debug; @@ -29,12 +30,9 @@ pub struct UnmatchedBrace { pub struct StringReader<'a> { crate sess: &'a ParseSess, - /// The absolute offset within the source_map of the next character to read - crate next_pos: BytePos, /// The absolute offset within the source_map of the current character crate pos: BytePos, /// The current character (which has been read from self.pos) - crate ch: Option, crate source_file: Lrc, /// Stop reading src at this index. 
crate end_src_index: usize, @@ -49,9 +47,22 @@ impl<'a> StringReader<'a> { pub fn new(sess: &'a ParseSess, source_file: Lrc<SourceFile>, override_span: Option<Span>) -> Self { - let mut sr = StringReader::new_internal(sess, source_file, override_span); - sr.bump(); - sr + if source_file.src.is_none() { + sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}", + source_file.name)); + } + + let src = (*source_file.src.as_ref().unwrap()).clone(); + + StringReader { + sess, + pos: source_file.start_pos, + source_file, + end_src_index: src.len(), + src, + fatal_errs: Vec::new(), + override_span, + } } pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self { @@ -63,39 +74,14 @@ impl<'a> StringReader<'a> { span = span.shrink_to_lo(); } - let mut sr = StringReader::new_internal(sess, begin.sf, None); + let mut sr = StringReader::new(sess, begin.sf, None); // Seek the lexer to the right byte range. - sr.next_pos = span.lo(); sr.end_src_index = sr.src_index(span.hi()); - sr.bump(); - sr } - fn new_internal(sess: &'a ParseSess, source_file: Lrc<SourceFile>, - override_span: Option<Span>) -> Self - { - if source_file.src.is_none() { - sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}", - source_file.name)); - } - - let src = (*source_file.src.as_ref().unwrap()).clone(); - - StringReader { - sess, - next_pos: source_file.start_pos, - pos: source_file.start_pos, - ch: Some('\n'), - source_file, - end_src_index: src.len(), - src, - fatal_errs: Vec::new(), - override_span, - } - } fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span { self.override_span.unwrap_or_else(|| Span::new(lo, hi, NO_EXPANSION)) @@ -117,19 +103,47 @@ impl<'a> StringReader<'a> { /// retrieved using `buffer_fatal_errors`. 
pub fn try_next_token(&mut self) -> Result<Token, ()> { assert!(self.fatal_errs.is_empty()); - match self.scan_whitespace_or_comment() { - Some(comment) => Ok(comment), - None => { - let (kind, start_pos, end_pos) = if self.is_eof() { - (token::Eof, self.source_file.end_pos, self.source_file.end_pos) - } else { - let start_pos = self.pos; - (self.next_token_inner()?, start_pos, self.pos) - }; - let span = self.mk_sp(start_pos, end_pos); - Ok(Token::new(kind, span)) + + let start_src_index = self.src_index(self.pos); + let text: &str = &self.src[start_src_index..self.end_src_index]; + + if text.is_empty() { + let span = self.mk_sp(self.source_file.end_pos, self.source_file.end_pos); + return Ok(Token::new(token::Eof, span)); + } + + { + let is_beginning_of_file = self.pos == self.source_file.start_pos; + if is_beginning_of_file { + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { + let start = self.pos; + self.pos = self.pos + BytePos::from_usize(shebang_len); + + let sym = self.symbol_from(start + BytePos::from_usize("#!".len())); + let kind = token::Shebang(sym); + + let span = self.mk_sp(start, self.pos); + return Ok(Token::new(kind, span)); + } } } + + let token = rustc_lexer::first_token(text); + + let start = self.pos; + self.pos = self.pos + BytePos::from_usize(token.len); + + debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start)); + + // This could use `?`, but that makes code significantly (10-20%) slower. + // https://github.com/rust-lang/rust/issues/37939 + let kind = match self.cook_lexer_token(token.kind, start) { + Ok(it) => it, + Err(err) => return Err(self.fatal_errs.push(err)), + }; + + let span = self.mk_sp(start, self.pos); + Ok(Token::new(kind, span)) } /// Returns the next token, including trivia like whitespace or comments. 
@@ -140,25 +154,7 @@ impl<'a> StringReader<'a> { self.unwrap_or_abort(res) } - #[inline] - fn is_eof(&self) -> bool { - self.ch.is_none() - } - - fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! { - let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string"); - err.span_label(self.mk_sp(pos, pos), "unterminated raw string"); - - if hash_count > 0 { - err.note(&format!("this raw string should be terminated with `\"{}`", - "#".repeat(hash_count as usize))); - } - - err.emit(); - FatalError.raise(); - } - - crate fn emit_fatal_errors(&mut self) { + fn emit_fatal_errors(&mut self) { for err in &mut self.fatal_errs { err.emit(); } @@ -176,11 +172,6 @@ impl<'a> StringReader<'a> { buffer } - #[inline] - fn ch_is(&self, c: char) -> bool { - self.ch == Some(c) - } - /// Report a fatal lexical error with a given span. fn fatal_span(&self, sp: Span, m: &str) -> FatalError { self.sess.span_diagnostic.span_fatal(sp, m) @@ -202,16 +193,6 @@ impl<'a> StringReader<'a> { self.err_span(self.mk_sp(from_pos, to_pos), m) } - /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an - /// escaped character to the error message - fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError { - let mut m = m.to_string(); - m.push_str(": "); - push_escaped_char(&mut m, c); - - self.fatal_span_(from_pos, to_pos, &m[..]) - } - fn struct_span_fatal(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> DiagnosticBuilder<'a> { @@ -228,6 +209,318 @@ impl<'a> StringReader<'a> { self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..]) } + /// Turns simple `rustc_lexer::TokenKind` enum into a rich + /// `libsyntax::TokenKind`. This turns strings into interned + /// symbols and runs additional validation. 
+ fn cook_lexer_token( + &self, + token: rustc_lexer::TokenKind, + start: BytePos, + ) -> Result> { + let kind = match token { + rustc_lexer::TokenKind::LineComment => { + let string = self.str_from(start); + // comments with only more "/"s are not doc comments + let tok = if is_doc_comment(string) { + let mut idx = 0; + loop { + idx = match string[idx..].find('\r') { + None => break, + Some(it) => it + 1 + }; + if string[idx..].chars().next() != Some('\n') { + self.err_span_(start + BytePos(idx as u32 - 1), + start + BytePos(idx as u32), + "bare CR not allowed in doc-comment"); + } + } + token::DocComment(Symbol::intern(string)) + } else { + token::Comment + }; + + tok + } + rustc_lexer::TokenKind::BlockComment { terminated } => { + let string = self.str_from(start); + // block comments starting with "/**" or "/*!" are doc-comments + // but comments with only "*"s between two "/"s are not + let is_doc_comment = is_block_doc_comment(string); + + if !terminated { + let msg = if is_doc_comment { + "unterminated block doc-comment" + } else { + "unterminated block comment" + }; + let last_bpos = self.pos; + self.fatal_span_(start, last_bpos, msg).raise(); + } + + let tok = if is_doc_comment { + let has_cr = string.contains('\r'); + let string = if has_cr { + self.translate_crlf(start, + string, + "bare CR not allowed in block doc-comment") + } else { + string.into() + }; + token::DocComment(Symbol::intern(&string[..])) + } else { + token::Comment + }; + + tok + } + rustc_lexer::TokenKind::Whitespace => token::Whitespace, + rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => { + let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent; + let mut ident_start = start; + if is_raw_ident { + ident_start = ident_start + BytePos(2); + } + // FIXME: perform NFKC normalization here. 
(Issue #2253) + let sym = self.symbol_from(ident_start); + if is_raw_ident { + let span = self.mk_sp(start, self.pos); + if !sym.can_be_raw() { + self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); + } + self.sess.raw_identifier_spans.borrow_mut().push(span); + } + token::Ident(sym, is_raw_ident) + } + rustc_lexer::TokenKind::Literal { kind, suffix_start } => { + let suffix_start = start + BytePos(suffix_start as u32); + let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); + let suffix = if suffix_start < self.pos { + let string = self.str_from(suffix_start); + if string == "_" { + self.sess.span_diagnostic + .struct_span_warn(self.mk_sp(suffix_start, self.pos), + "underscore literal suffix is not allowed") + .warn("this was previously accepted by the compiler but is \ + being phased out; it will become a hard error in \ + a future release!") + .note("for more information, see issue #42326 \ + ") + .emit(); + None + } else { + Some(Symbol::intern(string)) + } + } else { + None + }; + token::Literal(token::Lit { kind, symbol, suffix }) + } + rustc_lexer::TokenKind::Lifetime { starts_with_number } => { + // Include the leading `'` in the real identifier, for macro + // expansion purposes. See #12512 for the gory details of why + // this is necessary. 
+ let lifetime_name = self.str_from(start); + if starts_with_number { + self.err_span_( + start, + self.pos, + "lifetimes cannot start with a number", + ); + } + let ident = Symbol::intern(lifetime_name); + token::Lifetime(ident) + } + rustc_lexer::TokenKind::Semi => token::Semi, + rustc_lexer::TokenKind::Comma => token::Comma, + rustc_lexer::TokenKind::DotDotDot => token::DotDotDot, + rustc_lexer::TokenKind::DotDotEq => token::DotDotEq, + rustc_lexer::TokenKind::DotDot => token::DotDot, + rustc_lexer::TokenKind::Dot => token::Dot, + rustc_lexer::TokenKind::OpenParen => token::OpenDelim(token::Paren), + rustc_lexer::TokenKind::CloseParen => token::CloseDelim(token::Paren), + rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(token::Brace), + rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(token::Brace), + rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(token::Bracket), + rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(token::Bracket), + rustc_lexer::TokenKind::At => token::At, + rustc_lexer::TokenKind::Pound => token::Pound, + rustc_lexer::TokenKind::Tilde => token::Tilde, + rustc_lexer::TokenKind::Question => token::Question, + rustc_lexer::TokenKind::ColonColon => token::ModSep, + rustc_lexer::TokenKind::Colon => token::Colon, + rustc_lexer::TokenKind::Dollar => token::Dollar, + rustc_lexer::TokenKind::EqEq => token::EqEq, + rustc_lexer::TokenKind::Eq => token::Eq, + rustc_lexer::TokenKind::FatArrow => token::FatArrow, + rustc_lexer::TokenKind::Ne => token::Ne, + rustc_lexer::TokenKind::Not => token::Not, + rustc_lexer::TokenKind::Le => token::Le, + rustc_lexer::TokenKind::LArrow => token::LArrow, + rustc_lexer::TokenKind::Lt => token::Lt, + rustc_lexer::TokenKind::ShlEq => token::BinOpEq(token::Shl), + rustc_lexer::TokenKind::Shl => token::BinOp(token::Shl), + rustc_lexer::TokenKind::Ge => token::Ge, + rustc_lexer::TokenKind::Gt => token::Gt, + rustc_lexer::TokenKind::ShrEq => token::BinOpEq(token::Shr), + rustc_lexer::TokenKind::Shr 
=> token::BinOp(token::Shr), + rustc_lexer::TokenKind::RArrow => token::RArrow, + rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), + rustc_lexer::TokenKind::MinusEq => token::BinOpEq(token::Minus), + rustc_lexer::TokenKind::And => token::BinOp(token::And), + rustc_lexer::TokenKind::AndEq => token::BinOpEq(token::And), + rustc_lexer::TokenKind::AndAnd => token::AndAnd, + rustc_lexer::TokenKind::Or => token::BinOp(token::Or), + rustc_lexer::TokenKind::OrEq => token::BinOpEq(token::Or), + rustc_lexer::TokenKind::OrOr => token::OrOr, + rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), + rustc_lexer::TokenKind::PlusEq => token::BinOpEq(token::Plus), + rustc_lexer::TokenKind::Star => token::BinOp(token::Star), + rustc_lexer::TokenKind::StarEq => token::BinOpEq(token::Star), + rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), + rustc_lexer::TokenKind::SlashEq => token::BinOpEq(token::Slash), + rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), + rustc_lexer::TokenKind::CaretEq => token::BinOpEq(token::Caret), + rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), + rustc_lexer::TokenKind::PercentEq => token::BinOpEq(token::Percent), + + rustc_lexer::TokenKind::Unknown => { + let c = self.str_from(start).chars().next().unwrap(); + let mut err = self.struct_fatal_span_char(start, + self.pos, + "unknown start of token", + c); + unicode_chars::check_for_substitution(self, start, c, &mut err); + return Err(err) + } + }; + Ok(kind) + } + + fn cook_lexer_literal( + &self, + start: BytePos, + suffix_start: BytePos, + kind: rustc_lexer::LiteralKind + ) -> (token::LitKind, Symbol) { + match kind { + rustc_lexer::LiteralKind::Char { terminated } => { + if !terminated { + self.fatal_span_(start, suffix_start, + "unterminated character literal".into()) + .raise() + } + let content_start = start + BytePos(1); + let content_end = suffix_start - BytePos(1); + self.validate_char_escape(content_start, content_end); + let id = 
self.symbol_from_to(content_start, content_end); + (token::Char, id) + }, + rustc_lexer::LiteralKind::Byte { terminated } => { + if !terminated { + self.fatal_span_(start + BytePos(1), suffix_start, + "unterminated byte constant".into()) + .raise() + } + let content_start = start + BytePos(2); + let content_end = suffix_start - BytePos(1); + self.validate_byte_escape(content_start, content_end); + let id = self.symbol_from_to(content_start, content_end); + (token::Byte, id) + }, + rustc_lexer::LiteralKind::Str { terminated } => { + if !terminated { + self.fatal_span_(start, suffix_start, + "unterminated double quote string".into()) + .raise() + } + let content_start = start + BytePos(1); + let content_end = suffix_start - BytePos(1); + self.validate_str_escape(content_start, content_end); + let id = self.symbol_from_to(content_start, content_end); + (token::Str, id) + } + rustc_lexer::LiteralKind::ByteStr { terminated } => { + if !terminated { + self.fatal_span_(start + BytePos(1), suffix_start, + "unterminated double quote byte string".into()) + .raise() + } + let content_start = start + BytePos(2); + let content_end = suffix_start - BytePos(1); + self.validate_byte_str_escape(content_start, content_end); + let id = self.symbol_from_to(content_start, content_end); + (token::ByteStr, id) + } + rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => { + if !started { + self.report_non_started_raw_string(start); + } + if !terminated { + self.report_unterminated_raw_string(start, n_hashes) + } + let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); + let n = u32::from(n_hashes); + let content_start = start + BytePos(2 + n); + let content_end = suffix_start - BytePos(1 + n); + self.validate_raw_str_escape(content_start, content_end); + let id = self.symbol_from_to(content_start, content_end); + (token::StrRaw(n_hashes), id) + } + rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => { + if !started { + 
self.report_non_started_raw_string(start); + } + if !terminated { + self.report_unterminated_raw_string(start, n_hashes) + } + let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); + let n = u32::from(n_hashes); + let content_start = start + BytePos(3 + n); + let content_end = suffix_start - BytePos(1 + n); + self.validate_raw_byte_str_escape(content_start, content_end); + let id = self.symbol_from_to(content_start, content_end); + (token::ByteStrRaw(n_hashes), id) + } + rustc_lexer::LiteralKind::Int { base, empty_int } => { + if empty_int { + self.err_span_(start, suffix_start, "no valid digits found for number"); + (token::Integer, sym::integer(0)) + } else { + self.validate_int_literal(base, start, suffix_start); + (token::Integer, self.symbol_from_to(start, suffix_start)) + } + }, + rustc_lexer::LiteralKind::Float { base, empty_exponent } => { + if empty_exponent { + let mut err = self.struct_span_fatal( + start, self.pos, + "expected at least one digit in exponent" + ); + err.emit(); + } + + match base { + Base::Hexadecimal => { + self.err_span_(start, suffix_start, + "hexadecimal float literal is not supported") + } + Base::Octal => { + self.err_span_(start, suffix_start, + "octal float literal is not supported") + } + Base::Binary => { + self.err_span_(start, suffix_start, + "binary float literal is not supported") + } + _ => () + } + + let id = self.symbol_from_to(start, suffix_start); + (token::Float, id) + }, + } + } + #[inline] fn src_index(&self, pos: BytePos) -> usize { (pos - self.source_file.start_pos).to_usize() @@ -304,909 +597,58 @@ impl<'a> StringReader<'a> { } } - /// Advance the StringReader by one character. 
- crate fn bump(&mut self) { - let next_src_index = self.src_index(self.next_pos); - if next_src_index < self.end_src_index { - let next_ch = char_at(&self.src, next_src_index); - let next_ch_len = next_ch.len_utf8(); - - self.ch = Some(next_ch); - self.pos = self.next_pos; - self.next_pos = self.next_pos + Pos::from_usize(next_ch_len); - } else { - self.ch = None; - self.pos = self.next_pos; - } + fn report_non_started_raw_string(&self, start: BytePos) -> ! { + let bad_char = self.str_from(start).chars().last().unwrap(); + self + .struct_fatal_span_char( + start, + self.pos, + "found invalid character; only `#` is allowed \ + in raw string delimitation", + bad_char, + ) + .emit(); + FatalError.raise() } - fn nextch(&self) -> Option { - let next_src_index = self.src_index(self.next_pos); - if next_src_index < self.end_src_index { - Some(char_at(&self.src, next_src_index)) - } else { - None + fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! { + let mut err = self.struct_span_fatal( + start, start, + "unterminated raw string", + ); + err.span_label( + self.mk_sp(start, start), + "unterminated raw string", + ); + + if n_hashes > 0 { + err.note(&format!("this raw string should be terminated with `\"{}`", + "#".repeat(n_hashes as usize))); } + + err.emit(); + FatalError.raise() } - #[inline] - fn nextch_is(&self, c: char) -> bool { - self.nextch() == Some(c) - } - - fn nextnextch(&self) -> Option { - let next_src_index = self.src_index(self.next_pos); - if next_src_index < self.end_src_index { - let next_next_src_index = - next_src_index + char_at(&self.src, next_src_index).len_utf8(); - if next_next_src_index < self.end_src_index { - return Some(char_at(&self.src, next_next_src_index)); - } - } - None - } - - #[inline] - fn nextnextch_is(&self, c: char) -> bool { - self.nextnextch() == Some(c) - } - - /// Eats *, if possible. 
- fn scan_optional_raw_name(&mut self) -> Option { - if !ident_start(self.ch) { - return None; - } - - let start = self.pos; - self.bump(); - - while ident_continue(self.ch) { - self.bump(); - } - - match self.str_from(start) { - "_" => { - self.sess.span_diagnostic - .struct_span_warn(self.mk_sp(start, self.pos), - "underscore literal suffix is not allowed") - .warn("this was previously accepted by the compiler but is \ - being phased out; it will become a hard error in \ - a future release!") - .note("for more information, see issue #42326 \ - ") - .emit(); - None - } - name => Some(Symbol::intern(name)) - } - } - - /// PRECONDITION: self.ch is not whitespace - /// Eats any kind of comment. - fn scan_comment(&mut self) -> Option { - if let Some(c) = self.ch { - if c.is_whitespace() { - let msg = "called consume_any_line_comment, but there was whitespace"; - self.sess.span_diagnostic.span_err(self.mk_sp(self.pos, self.pos), msg); - } - } - - if self.ch_is('/') { - match self.nextch() { - Some('/') => { - self.bump(); - self.bump(); - - // line comments starting with "///" or "//!" are doc-comments - let doc_comment = (self.ch_is('/') && !self.nextch_is('/')) || self.ch_is('!'); - let start_bpos = self.pos - BytePos(2); - - while !self.is_eof() { - match self.ch.unwrap() { - '\n' => break, - '\r' => { - if self.nextch_is('\n') { - // CRLF - break; - } else if doc_comment { - self.err_span_(self.pos, - self.next_pos, - "bare CR not allowed in doc-comment"); - } - } - _ => (), - } - self.bump(); - } - - let kind = if doc_comment { - token::DocComment(self.symbol_from(start_bpos)) - } else { - token::Comment - }; - Some(Token::new(kind, self.mk_sp(start_bpos, self.pos))) - } - Some('*') => { - self.bump(); - self.bump(); - self.scan_block_comment() - } - _ => None, - } - } else if self.ch_is('#') { - if self.nextch_is('!') { - - // Parse an inner attribute. 
- if self.nextnextch_is('[') { - return None; - } - - let is_beginning_of_file = self.pos == self.source_file.start_pos; - if is_beginning_of_file { - debug!("skipping a shebang"); - let start = self.pos; - while !self.ch_is('\n') && !self.is_eof() { - self.bump(); - } - return Some(Token::new( - token::Shebang(self.symbol_from(start)), - self.mk_sp(start, self.pos), - )); - } - } - None - } else { - None - } - } - - /// If there is whitespace, shebang, or a comment, scan it. Otherwise, - /// return `None`. - fn scan_whitespace_or_comment(&mut self) -> Option { - match self.ch.unwrap_or('\0') { - // # to handle shebang at start of file -- this is the entry point - // for skipping over all "junk" - '/' | '#' => { - let c = self.scan_comment(); - debug!("scanning a comment {:?}", c); - c - }, - c if is_pattern_whitespace(Some(c)) => { - let start_bpos = self.pos; - while is_pattern_whitespace(self.ch) { - self.bump(); - } - let c = Some(Token::new(token::Whitespace, self.mk_sp(start_bpos, self.pos))); - debug!("scanning whitespace: {:?}", c); - c - } - _ => None, - } - } - - /// Might return a sugared-doc-attr - fn scan_block_comment(&mut self) -> Option { - // block comments starting with "/**" or "/*!" 
are doc-comments - let is_doc_comment = self.ch_is('*') || self.ch_is('!'); - let start_bpos = self.pos - BytePos(2); - - let mut level: isize = 1; - let mut has_cr = false; - while level > 0 { - if self.is_eof() { - let msg = if is_doc_comment { - "unterminated block doc-comment" - } else { - "unterminated block comment" - }; - let last_bpos = self.pos; - self.fatal_span_(start_bpos, last_bpos, msg).raise(); - } - let n = self.ch.unwrap(); - match n { - '/' if self.nextch_is('*') => { - level += 1; - self.bump(); - } - '*' if self.nextch_is('/') => { - level -= 1; - self.bump(); - } - '\r' => { - has_cr = true; - } - _ => (), - } - self.bump(); - } - - let string = self.str_from(start_bpos); - // but comments with only "*"s between two "/"s are not - let kind = if is_block_doc_comment(string) { - let string = if has_cr { - self.translate_crlf(start_bpos, - string, - "bare CR not allowed in block doc-comment") - } else { - string.into() - }; - token::DocComment(Symbol::intern(&string[..])) - } else { - token::Comment - }; - - Some(Token::new(kind, self.mk_sp(start_bpos, self.pos))) - } - - /// Scan through any digits (base `scan_radix`) or underscores, - /// and return how many digits there were. - /// - /// `real_radix` represents the true radix of the number we're - /// interested in, and errors will be emitted for any digits - /// between `real_radix` and `scan_radix`. 
- fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize { - assert!(real_radix <= scan_radix); - let mut len = 0; - - loop { - let c = self.ch; - if c == Some('_') { - debug!("skipping a _"); - self.bump(); - continue; - } - match c.and_then(|cc| cc.to_digit(scan_radix)) { - Some(_) => { - debug!("{:?} in scan_digits", c); - // check that the hypothetical digit is actually - // in range for the true radix - if c.unwrap().to_digit(real_radix).is_none() { - self.err_span_(self.pos, - self.next_pos, - &format!("invalid digit for a base {} literal", real_radix)); - } - len += 1; - self.bump(); - } - _ => return len, - } - } - } - - /// Lex a LIT_INTEGER or a LIT_FLOAT - fn scan_number(&mut self, c: char) -> (token::LitKind, Symbol) { - let mut base = 10; - let start_bpos = self.pos; - self.bump(); - - let num_digits = if c == '0' { - match self.ch.unwrap_or('\0') { - 'b' => { - self.bump(); - base = 2; - self.scan_digits(2, 10) - } - 'o' => { - self.bump(); - base = 8; - self.scan_digits(8, 10) - } - 'x' => { - self.bump(); - base = 16; - self.scan_digits(16, 16) - } - '0'..='9' | '_' | '.' 
| 'e' | 'E' => { - self.scan_digits(10, 10) + 1 - } - _ => { - // just a 0 - return (token::Integer, sym::integer(0)); - } - } - } else if c.is_digit(10) { - self.scan_digits(10, 10) + 1 - } else { - 0 - }; - - if num_digits == 0 { - self.err_span_(start_bpos, self.pos, "no valid digits found for number"); - - return (token::Integer, sym::integer(0)); - } - - // might be a float, but don't be greedy if this is actually an - // integer literal followed by field/method access or a range pattern - // (`0..2` and `12.foo()`) - if self.ch_is('.') && !self.nextch_is('.') && - !ident_start(self.nextch()) { - // might have stuff after the ., and if it does, it needs to start - // with a number - self.bump(); - if self.ch.unwrap_or('\0').is_digit(10) { - self.scan_digits(10, 10); - self.scan_float_exponent(); - } - let pos = self.pos; - self.check_float_base(start_bpos, pos, base); - - (token::Float, self.symbol_from(start_bpos)) - } else { - // it might be a float if it has an exponent - if self.ch_is('e') || self.ch_is('E') { - self.scan_float_exponent(); - let pos = self.pos; - self.check_float_base(start_bpos, pos, base); - return (token::Float, self.symbol_from(start_bpos)); - } - // but we certainly have an integer! - (token::Integer, self.symbol_from(start_bpos)) - } - } - - /// Scan over a float exponent. - fn scan_float_exponent(&mut self) { - if self.ch_is('e') || self.ch_is('E') { - self.bump(); - - if self.ch_is('-') || self.ch_is('+') { - self.bump(); - } - - if self.scan_digits(10, 10) == 0 { - let mut err = self.struct_span_fatal( - self.pos, self.next_pos, - "expected at least one digit in exponent" - ); - if let Some(ch) = self.ch { - // check for e.g., Unicode minus '−' (Issue #49746) - if unicode_chars::check_for_substitution(self, ch, &mut err) { - self.bump(); - self.scan_digits(10, 10); - } - } - err.emit(); - } - } - } - - /// Checks that a base is valid for a floating literal, emitting a nice - /// error if it isn't. 
- fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) { - match base { - 16 => { - self.err_span_(start_bpos, - last_bpos, - "hexadecimal float literal is not supported") - } - 8 => { - self.err_span_(start_bpos, - last_bpos, - "octal float literal is not supported") - } - 2 => { - self.err_span_(start_bpos, - last_bpos, - "binary float literal is not supported") - } - _ => (), - } - } - - fn binop(&mut self, op: token::BinOpToken) -> TokenKind { - self.bump(); - if self.ch_is('=') { - self.bump(); - token::BinOpEq(op) - } else { - token::BinOp(op) - } - } - - /// Returns the next token from the string, advances the input past that - /// token, and updates the interner - fn next_token_inner(&mut self) -> Result { - let c = self.ch; - - if ident_start(c) { - let (is_ident_start, is_raw_ident) = - match (c.unwrap(), self.nextch(), self.nextnextch()) { - // r# followed by an identifier starter is a raw identifier. - // This is an exception to the r# case below. - ('r', Some('#'), x) if ident_start(x) => (true, true), - // r as in r" or r#" is part of a raw string literal. - // b as in b' is part of a byte literal. - // They are not identifiers, and are handled further down. - ('r', Some('"'), _) | - ('r', Some('#'), _) | - ('b', Some('"'), _) | - ('b', Some('\''), _) | - ('b', Some('r'), Some('"')) | - ('b', Some('r'), Some('#')) => (false, false), - _ => (true, false), - }; - - if is_ident_start { - let raw_start = self.pos; - if is_raw_ident { - // Consume the 'r#' characters. - self.bump(); - self.bump(); - } - - let start = self.pos; - self.bump(); - - while ident_continue(self.ch) { - self.bump(); - } - - // FIXME: perform NFKC normalization here. 
(Issue #2253) - let name = self.symbol_from(start); - if is_raw_ident { - let span = self.mk_sp(raw_start, self.pos); - if !name.can_be_raw() { - self.err_span(span, &format!("`{}` cannot be a raw identifier", name)); - } - self.sess.raw_identifier_spans.borrow_mut().push(span); - } - - return Ok(token::Ident(name, is_raw_ident)); - } - } - - if is_dec_digit(c) { - let (kind, symbol) = self.scan_number(c.unwrap()); - let suffix = self.scan_optional_raw_name(); - debug!("next_token_inner: scanned number {:?}, {:?}, {:?}", kind, symbol, suffix); - return Ok(TokenKind::lit(kind, symbol, suffix)); - } - - match c.expect("next_token_inner called at EOF") { - // One-byte tokens. - ';' => { - self.bump(); - Ok(token::Semi) - } - ',' => { - self.bump(); - Ok(token::Comma) - } - '.' => { - self.bump(); - if self.ch_is('.') { - self.bump(); - if self.ch_is('.') { - self.bump(); - Ok(token::DotDotDot) - } else if self.ch_is('=') { - self.bump(); - Ok(token::DotDotEq) - } else { - Ok(token::DotDot) - } - } else { - Ok(token::Dot) - } - } - '(' => { - self.bump(); - Ok(token::OpenDelim(token::Paren)) - } - ')' => { - self.bump(); - Ok(token::CloseDelim(token::Paren)) - } - '{' => { - self.bump(); - Ok(token::OpenDelim(token::Brace)) - } - '}' => { - self.bump(); - Ok(token::CloseDelim(token::Brace)) - } - '[' => { - self.bump(); - Ok(token::OpenDelim(token::Bracket)) - } - ']' => { - self.bump(); - Ok(token::CloseDelim(token::Bracket)) - } - '@' => { - self.bump(); - Ok(token::At) - } - '#' => { - self.bump(); - Ok(token::Pound) - } - '~' => { - self.bump(); - Ok(token::Tilde) - } - '?' => { - self.bump(); - Ok(token::Question) - } - ':' => { - self.bump(); - if self.ch_is(':') { - self.bump(); - Ok(token::ModSep) - } else { - Ok(token::Colon) - } - } - - '$' => { - self.bump(); - Ok(token::Dollar) - } - - // Multi-byte tokens. 
- '=' => { - self.bump(); - if self.ch_is('=') { - self.bump(); - Ok(token::EqEq) - } else if self.ch_is('>') { - self.bump(); - Ok(token::FatArrow) - } else { - Ok(token::Eq) - } - } - '!' => { - self.bump(); - if self.ch_is('=') { - self.bump(); - Ok(token::Ne) - } else { - Ok(token::Not) - } - } - '<' => { - self.bump(); - match self.ch.unwrap_or('\x00') { - '=' => { - self.bump(); - Ok(token::Le) - } - '<' => { - Ok(self.binop(token::Shl)) - } - '-' => { - self.bump(); - Ok(token::LArrow) - } - _ => { - Ok(token::Lt) - } - } - } - '>' => { - self.bump(); - match self.ch.unwrap_or('\x00') { - '=' => { - self.bump(); - Ok(token::Ge) - } - '>' => { - Ok(self.binop(token::Shr)) - } - _ => { - Ok(token::Gt) - } - } - } - '\'' => { - // Either a character constant 'a' OR a lifetime name 'abc - let start_with_quote = self.pos; - self.bump(); - let start = self.pos; - - // If the character is an ident start not followed by another single - // quote, then this is a lifetime name: - let starts_with_number = self.ch.unwrap_or('\x00').is_numeric(); - if (ident_start(self.ch) || starts_with_number) && !self.nextch_is('\'') { - self.bump(); - while ident_continue(self.ch) { - self.bump(); - } - // lifetimes shouldn't end with a single quote - // if we find one, then this is an invalid character literal - if self.ch_is('\'') { - let symbol = self.symbol_from(start); - self.bump(); - self.validate_char_escape(start_with_quote); - return Ok(TokenKind::lit(token::Char, symbol, None)); - } - - if starts_with_number { - // this is a recovered lifetime written `'1`, error but accept it - self.err_span_( - start_with_quote, - self.pos, - "lifetimes cannot start with a number", - ); - } - - // Include the leading `'` in the real identifier, for macro - // expansion purposes. See #12512 for the gory details of why - // this is necessary. 
- return Ok(token::Lifetime(self.symbol_from(start_with_quote))); - } - let msg = "unterminated character literal"; - let symbol = self.scan_single_quoted_string(start_with_quote, msg); - self.validate_char_escape(start_with_quote); - let suffix = self.scan_optional_raw_name(); - Ok(TokenKind::lit(token::Char, symbol, suffix)) - } - 'b' => { - self.bump(); - let (kind, symbol) = match self.ch { - Some('\'') => { - let start_with_quote = self.pos; - self.bump(); - let msg = "unterminated byte constant"; - let symbol = self.scan_single_quoted_string(start_with_quote, msg); - self.validate_byte_escape(start_with_quote); - (token::Byte, symbol) - }, - Some('"') => { - let start_with_quote = self.pos; - let msg = "unterminated double quote byte string"; - let symbol = self.scan_double_quoted_string(msg); - self.validate_byte_str_escape(start_with_quote); - (token::ByteStr, symbol) - }, - Some('r') => { - let (start, end, hash_count) = self.scan_raw_string(); - let symbol = self.symbol_from_to(start, end); - self.validate_raw_byte_str_escape(start, end); - - (token::ByteStrRaw(hash_count), symbol) - } - _ => unreachable!(), // Should have been a token::Ident above. 
- }; - let suffix = self.scan_optional_raw_name(); - - Ok(TokenKind::lit(kind, symbol, suffix)) - } - '"' => { - let start_with_quote = self.pos; - let msg = "unterminated double quote string"; - let symbol = self.scan_double_quoted_string(msg); - self.validate_str_escape(start_with_quote); - let suffix = self.scan_optional_raw_name(); - Ok(TokenKind::lit(token::Str, symbol, suffix)) - } - 'r' => { - let (start, end, hash_count) = self.scan_raw_string(); - let symbol = self.symbol_from_to(start, end); - self.validate_raw_str_escape(start, end); - let suffix = self.scan_optional_raw_name(); - - Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix)) - } - '-' => { - if self.nextch_is('>') { - self.bump(); - self.bump(); - Ok(token::RArrow) - } else { - Ok(self.binop(token::Minus)) - } - } - '&' => { - if self.nextch_is('&') { - self.bump(); - self.bump(); - Ok(token::AndAnd) - } else { - Ok(self.binop(token::And)) - } - } - '|' => { - match self.nextch() { - Some('|') => { - self.bump(); - self.bump(); - Ok(token::OrOr) - } - _ => { - Ok(self.binop(token::Or)) - } - } - } - '+' => { - Ok(self.binop(token::Plus)) - } - '*' => { - Ok(self.binop(token::Star)) - } - '/' => { - Ok(self.binop(token::Slash)) - } - '^' => { - Ok(self.binop(token::Caret)) - } - '%' => { - Ok(self.binop(token::Percent)) - } - c => { - let last_bpos = self.pos; - let bpos = self.next_pos; - let mut err = self.struct_fatal_span_char(last_bpos, - bpos, - "unknown start of token", - c); - unicode_chars::check_for_substitution(self, c, &mut err); - self.fatal_errs.push(err); - - Err(()) - } - } - } - - fn read_to_eol(&mut self) -> String { - let mut val = String::new(); - while !self.ch_is('\n') && !self.is_eof() { - val.push(self.ch.unwrap()); - self.bump(); - } - - if self.ch_is('\n') { - self.bump(); - } - - val - } - - fn read_one_line_comment(&mut self) -> String { - let val = self.read_to_eol(); - assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') || - (val.as_bytes()[0] 
== b'#' && val.as_bytes()[1] == b'!')); - val - } - - fn consume_non_eol_whitespace(&mut self) { - while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() { - self.bump(); - } - } - - fn peeking_at_comment(&self) -> bool { - (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) || - // consider shebangs comments, but not inner attributes - (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('[')) - } - - fn scan_single_quoted_string(&mut self, - start_with_quote: BytePos, - unterminated_msg: &str) -> Symbol { - // assumes that first `'` is consumed - let start = self.pos; - // lex `'''` as a single char, for recovery - if self.ch_is('\'') && self.nextch_is('\'') { - self.bump(); - } else { - let mut first = true; - loop { - if self.ch_is('\'') { - break; - } - if self.ch_is('\\') && (self.nextch_is('\'') || self.nextch_is('\\')) { - self.bump(); - self.bump(); - } else { - // Only attempt to infer single line string literals. If we encounter - // a slash, bail out in order to avoid nonsensical suggestion when - // involving comments. 
- if self.is_eof() - || (self.ch_is('/') && !first) - || (self.ch_is('\n') && !self.nextch_is('\'')) { - - self.fatal_span_(start_with_quote, self.pos, unterminated_msg.into()) - .raise() - } - self.bump(); - } - first = false; - } - } - - let id = self.symbol_from(start); - self.bump(); - id - } - - fn scan_double_quoted_string(&mut self, unterminated_msg: &str) -> Symbol { - debug_assert!(self.ch_is('\"')); - let start_with_quote = self.pos; - self.bump(); - let start = self.pos; - while !self.ch_is('"') { - if self.is_eof() { - let pos = self.pos; - self.fatal_span_(start_with_quote, pos, unterminated_msg).raise(); - } - if self.ch_is('\\') && (self.nextch_is('\\') || self.nextch_is('"')) { - self.bump(); - } - self.bump(); - } - let id = self.symbol_from(start); - self.bump(); - id - } - - /// Scans a raw (byte) string, returning byte position range for `""` - /// (including quotes) along with `#` character count in `(b)r##...""##...`; - fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) { - let start_bpos = self.pos; - self.bump(); - let mut hash_count: u16 = 0; - while self.ch_is('#') { - if hash_count == 65535 { - let bpos = self.next_pos; - self.fatal_span_(start_bpos, - bpos, + fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 { + match n_hashes.try_into() { + Ok(n_hashes) => n_hashes, + Err(_) => { + self.fatal_span_(start, + self.pos, "too many `#` symbols: raw strings may be \ - delimited by up to 65535 `#` symbols").raise(); + delimited by up to 65535 `#` symbols").raise(); } - self.bump(); - hash_count += 1; } - - if self.is_eof() { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } else if !self.ch_is('"') { - let last_bpos = self.pos; - let curr_char = self.ch.unwrap(); - self.fatal_span_char(start_bpos, - last_bpos, - "found invalid character; only `#` is allowed \ - in raw string delimitation", - curr_char).raise(); - } - self.bump(); - let content_start_bpos = self.pos; - let mut content_end_bpos; - 'outer: 
loop { - match self.ch { - None => { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } - Some('"') => { - content_end_bpos = self.pos; - for _ in 0..hash_count { - self.bump(); - if !self.ch_is('#') { - continue 'outer; - } - } - break; - } - _ => (), - } - self.bump(); - } - - self.bump(); - - (content_start_bpos, content_end_bpos, hash_count) } - fn validate_char_escape(&self, start_with_quote: BytePos) { - let lit = self.str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1)); + fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) { + let lit = self.str_from_to(content_start, content_end); if let Err((off, err)) = unescape::unescape_char(lit) { emit_unescape_error( &self.sess.span_diagnostic, lit, - self.mk_sp(start_with_quote, self.pos), + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), unescape::Mode::Char, 0..off, err, @@ -1214,13 +656,13 @@ impl<'a> StringReader<'a> { } } - fn validate_byte_escape(&self, start_with_quote: BytePos) { - let lit = self.str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1)); + fn validate_byte_escape(&self, content_start: BytePos, content_end: BytePos) { + let lit = self.str_from_to(content_start, content_end); if let Err((off, err)) = unescape::unescape_byte(lit) { emit_unescape_error( &self.sess.span_diagnostic, lit, - self.mk_sp(start_with_quote, self.pos), + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), unescape::Mode::Byte, 0..off, err, @@ -1228,14 +670,14 @@ impl<'a> StringReader<'a> { } } - fn validate_str_escape(&self, start_with_quote: BytePos) { - let lit = self.str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1)); + fn validate_str_escape(&self, content_start: BytePos, content_end: BytePos) { + let lit = self.str_from_to(content_start, content_end); unescape::unescape_str(lit, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, lit, - self.mk_sp(start_with_quote, 
self.pos), + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), unescape::Mode::Str, range, err, @@ -1276,14 +718,14 @@ impl<'a> StringReader<'a> { }) } - fn validate_byte_str_escape(&self, start_with_quote: BytePos) { - let lit = self.str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1)); + fn validate_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { + let lit = self.str_from_to(content_start, content_end); unescape::unescape_byte_str(lit, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, lit, - self.mk_sp(start_with_quote, self.pos), + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), unescape::Mode::ByteStr, range, err, @@ -1291,23 +733,25 @@ impl<'a> StringReader<'a> { } }) } -} -// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which -// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3 -#[inline] -crate fn is_pattern_whitespace(c: Option) -> bool { - c.map_or(false, Pattern_White_Space) -} + fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { + let base = match base { + Base::Binary => 2, + Base::Octal => 8, + _ => return, + }; + let s = self.str_from_to(content_start + BytePos(2), content_end); + for (idx, c) in s.char_indices() { + let idx = idx as u32; + if c != '_' && c.to_digit(base).is_none() { + let lo = content_start + BytePos(2 + idx); + let hi = content_start + BytePos(2 + idx + c.len_utf8() as u32); + self.err_span_(lo, hi, + &format!("invalid digit for a base {} literal", base)); -#[inline] -fn in_range(c: Option, lo: char, hi: char) -> bool { - c.map_or(false, |c| lo <= c && c <= hi) -} - -#[inline] -fn is_dec_digit(c: Option) -> bool { - in_range(c, '0', '9') + } + } + } } fn is_doc_comment(s: &str) -> bool { @@ -1325,31 +769,6 @@ fn is_block_doc_comment(s: &str) -> bool { res } -/// Determine whether `c` is a valid start for an ident. 
-fn ident_start(c: Option) -> bool { - let c = match c { - Some(c) => c, - None => return false, - }; - - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start()) -} - -fn ident_continue(c: Option) -> bool { - let c = match c { - Some(c) => c, - None => return false, - }; - - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || - (c > '\x7f' && c.is_xid_continue()) -} - -#[inline] -fn char_at(s: &str, byte: usize) -> char { - s[byte..].chars().next().unwrap() -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs index 6a870685938..e51657c0f13 100644 --- a/src/libsyntax/parse/lexer/unicode_chars.rs +++ b/src/libsyntax/parse/lexer/unicode_chars.rs @@ -3,7 +3,7 @@ use super::StringReader; use errors::{Applicability, DiagnosticBuilder}; -use syntax_pos::{Pos, Span, NO_EXPANSION}; +use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; #[rustfmt::skip] // for line breaks const UNICODE_ARRAY: &[(char, &str, char)] = &[ @@ -327,6 +327,7 @@ const ASCII_ARRAY: &[(char, &str)] = &[ crate fn check_for_substitution<'a>( reader: &StringReader<'a>, + pos: BytePos, ch: char, err: &mut DiagnosticBuilder<'a>, ) -> bool { @@ -335,19 +336,19 @@ crate fn check_for_substitution<'a>( None => return false, }; - let span = Span::new(reader.pos, reader.next_pos, NO_EXPANSION); + let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION); let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) { Some((_ascii_char, ascii_name)) => ascii_name, None => { let msg = format!("substitution character not found for '{}'", ch); reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); - return false - }, + return false; + } }; // special help suggestion for "directed" double quotes - if let Some(s) = reader.peek_delimited('“', '”') { + if let Some(s) = 
peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') { let msg = format!( "Unicode characters '“' (Left Double Quotation Mark) and \ '”' (Right Double Quotation Mark) look like '{}' ({}), but are not", @@ -355,8 +356,8 @@ crate fn check_for_substitution<'a>( ); err.span_suggestion( Span::new( - reader.pos, - reader.next_pos + Pos::from_usize(s.len()) + Pos::from_usize('”'.len_utf8()), + pos, + pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()), NO_EXPANSION, ), &msg, @@ -368,26 +369,18 @@ crate fn check_for_substitution<'a>( "Unicode character '{}' ({}) looks like '{}' ({}), but it is not", ch, u_name, ascii_char, ascii_name ); - err.span_suggestion( - span, - &msg, - ascii_char.to_string(), - Applicability::MaybeIncorrect, - ); + err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect); } true } -impl StringReader<'_> { - /// Immutably extract string if found at current position with given delimiters - fn peek_delimited(&self, from_ch: char, to_ch: char) -> Option<&str> { - let tail = &self.src[self.src_index(self.pos)..]; - let mut chars = tail.chars(); - let first_char = chars.next()?; - if first_char != from_ch { - return None; - } - let last_char_idx = chars.as_str().find(to_ch)?; - Some(&chars.as_str()[..last_char_idx]) +/// Extract string if found at current position with given delimiters +fn peek_delimited(text: &str, from_ch: char, to_ch: char) -> Option<&str> { + let mut chars = text.chars(); + let first_char = chars.next()?; + if first_char != from_ch { + return None; } + let last_char_idx = chars.as_str().find(to_ch)?; + Some(&chars.as_str()[..last_char_idx]) } diff --git a/src/libsyntax/util/parser_testing.rs b/src/libsyntax/util/parser_testing.rs index 2ef32d37d44..627422df1db 100644 --- a/src/libsyntax/util/parser_testing.rs +++ b/src/libsyntax/util/parser_testing.rs @@ -1,7 +1,7 @@ use crate::ast::{self, Ident}; use crate::source_map::FilePathMapping; use crate::parse::{ParseSess, PResult, 
source_file_to_stream}; -use crate::parse::{lexer, new_parser_from_source_str}; +use crate::parse::new_parser_from_source_str; use crate::parse::parser::Parser; use crate::ptr::P; use crate::tokenstream::TokenStream; @@ -113,14 +113,14 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool { } /// Advances the given peekable `Iterator` until it reaches a non-whitespace character -fn scan_for_non_ws_or_end>(iter: &mut Peekable) { - while lexer::is_pattern_whitespace(iter.peek().cloned()) { +fn scan_for_non_ws_or_end>(iter: &mut Peekable) { + while iter.peek().copied().map(|c| is_pattern_whitespace(c)) == Some(true) { iter.next(); } } pub fn is_pattern_whitespace(c: char) -> bool { - lexer::is_pattern_whitespace(Some(c)) + rustc_lexer::character_properties::is_whitespace(c) } #[cfg(test)] diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs index b4795e76c98..5c2c3b8ec61 100644 --- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs +++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs @@ -1,4 +1,5 @@ const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻² //~^ ERROR expected at least one digit in exponent +//~| ERROR unknown start of token: \u{2212} fn main() {} diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr index 253deb2be6e..07653c791db 100644 --- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr +++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr @@ -1,4 +1,10 @@ error: expected at least one digit in exponent + --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:47 + | +LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // 
m³⋅kg⁻¹⋅s⁻² + | ^^^^^^ + +error: unknown start of token: \u{2212} --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53 | LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻² @@ -8,5 +14,5 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻² | ^ -error: aborting due to previous error +error: aborting due to 2 previous errors diff --git a/src/test/ui/parser/lex-bad-numeric-literals.stderr b/src/test/ui/parser/lex-bad-numeric-literals.stderr index 84e27f7366d..151480dd012 100644 --- a/src/test/ui/parser/lex-bad-numeric-literals.stderr +++ b/src/test/ui/parser/lex-bad-numeric-literals.stderr @@ -53,10 +53,10 @@ LL | 0o; | ^^ error: expected at least one digit in exponent - --> $DIR/lex-bad-numeric-literals.rs:12:8 + --> $DIR/lex-bad-numeric-literals.rs:12:5 | LL | 1e+; - | ^ + | ^^^ error: hexadecimal float literal is not supported --> $DIR/lex-bad-numeric-literals.rs:13:5 diff --git a/src/test/ui/parser/raw-byte-string-eof.stderr b/src/test/ui/parser/raw-byte-string-eof.stderr index 2ba50e8fb2a..65fa89f2a81 100644 --- a/src/test/ui/parser/raw-byte-string-eof.stderr +++ b/src/test/ui/parser/raw-byte-string-eof.stderr @@ -1,8 +1,8 @@ error: unterminated raw string - --> $DIR/raw-byte-string-eof.rs:2:6 + --> $DIR/raw-byte-string-eof.rs:2:5 | LL | br##"a"#; - | ^ unterminated raw string + | ^ unterminated raw string | = note: this raw string should be terminated with `"##` diff --git a/src/test/ui/parser/raw-byte-string-literals.stderr b/src/test/ui/parser/raw-byte-string-literals.stderr index 4880d1fdbe8..4076fe334e6 100644 --- a/src/test/ui/parser/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw-byte-string-literals.stderr @@ -11,10 +11,10 @@ LL | br"é"; | ^ error: found invalid character; only `#` is allowed in raw string delimitation: ~ - --> $DIR/raw-byte-string-literals.rs:6:6 + --> 
$DIR/raw-byte-string-literals.rs:6:5 | LL | br##~"a"~##; - | ^^^ + | ^^^^^ error: aborting due to 3 previous errors diff --git a/src/test/ui/parser/raw-str-delim.stderr b/src/test/ui/parser/raw-str-delim.stderr index b86b9e90e73..8a04f99a126 100644 --- a/src/test/ui/parser/raw-str-delim.stderr +++ b/src/test/ui/parser/raw-str-delim.stderr @@ -2,7 +2,7 @@ error: found invalid character; only `#` is allowed in raw string delimitation: --> $DIR/raw-str-delim.rs:2:5 | LL | r#~"#"~# - | ^^ + | ^^^ error: aborting due to previous error