mirror of
https://github.com/rust-lang/rust.git
synced 2025-01-12 07:43:31 +00:00
Auto merge of #43716 - MaloJaffre:_-in-literals, r=petrochenkov
Accept underscores in unicode escapes. Fixes #43692. I don't know if this needs an RFC, but at least the impl is here!
This commit is contained in:
commit
11f64d8f88
@ -963,60 +963,67 @@ impl<'a> StringReader<'a> {
|
||||
true
|
||||
}
|
||||
|
||||
/// Scan over a \u{...} escape
|
||||
/// Scan over a `\u{...}` escape
|
||||
///
|
||||
/// At this point, we have already seen the \ and the u, the { is the current character. We
|
||||
/// will read at least one digit, and up to 6, and pass over the }.
|
||||
/// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
|
||||
/// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
|
||||
/// and pass over the `}`.
|
||||
fn scan_unicode_escape(&mut self, delim: char) -> bool {
|
||||
self.bump(); // past the {
|
||||
let start_bpos = self.pos;
|
||||
let mut count = 0;
|
||||
let mut accum_int = 0;
|
||||
let mut valid = true;
|
||||
|
||||
while !self.ch_is('}') && count <= 6 {
|
||||
let c = match self.ch {
|
||||
Some(c) => c,
|
||||
if let Some('_') = self.ch {
|
||||
// disallow leading `_`
|
||||
self.err_span_(self.pos,
|
||||
self.next_pos,
|
||||
"invalid start of unicode escape");
|
||||
valid = false;
|
||||
}
|
||||
|
||||
let count = self.scan_digits(16, 16);
|
||||
|
||||
if count > 6 {
|
||||
self.err_span_(start_bpos,
|
||||
self.pos,
|
||||
"overlong unicode escape (must have at most 6 hex digits)");
|
||||
valid = false;
|
||||
}
|
||||
loop {
|
||||
match self.ch {
|
||||
Some('}') => {
|
||||
if valid && count == 0 {
|
||||
self.err_span_(start_bpos,
|
||||
self.pos,
|
||||
"empty unicode escape (must have at least 1 hex digit)");
|
||||
valid = false;
|
||||
}
|
||||
self.bump(); // past the ending `}`
|
||||
break;
|
||||
},
|
||||
Some(c) => {
|
||||
if c == delim {
|
||||
self.err_span_(self.pos,
|
||||
self.pos,
|
||||
"unterminated unicode escape (needed a `}`)");
|
||||
valid = false;
|
||||
break;
|
||||
} else if valid {
|
||||
self.err_span_char(start_bpos,
|
||||
self.pos,
|
||||
"invalid character in unicode escape",
|
||||
c);
|
||||
valid = false;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
panic!(self.fatal_span_(start_bpos,
|
||||
self.pos,
|
||||
"unterminated unicode escape (found EOF)"));
|
||||
}
|
||||
};
|
||||
accum_int *= 16;
|
||||
accum_int += c.to_digit(16).unwrap_or_else(|| {
|
||||
if c == delim {
|
||||
panic!(self.fatal_span_(self.pos,
|
||||
self.next_pos,
|
||||
"unterminated unicode escape (needed a `}`)"));
|
||||
} else {
|
||||
self.err_span_char(self.pos,
|
||||
self.next_pos,
|
||||
"invalid character in unicode escape",
|
||||
c);
|
||||
}
|
||||
valid = false;
|
||||
0
|
||||
});
|
||||
self.bump();
|
||||
count += 1;
|
||||
}
|
||||
|
||||
if count > 6 {
|
||||
self.err_span_(start_bpos,
|
||||
self.pos,
|
||||
"overlong unicode escape (can have at most 6 hex digits)");
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if valid && (char::from_u32(accum_int).is_none() || count == 0) {
|
||||
self.err_span_(start_bpos,
|
||||
self.pos,
|
||||
"invalid unicode character escape");
|
||||
valid = false;
|
||||
}
|
||||
|
||||
self.bump(); // past the ending }
|
||||
valid
|
||||
}
|
||||
|
||||
|
@ -230,7 +230,7 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser {
|
||||
/// Rather than just accepting/rejecting a given literal, unescapes it as
|
||||
/// well. Can take any slice prefixed by a character escape. Returns the
|
||||
/// character and the number of characters consumed.
|
||||
pub fn char_lit(lit: &str) -> (char, isize) {
|
||||
pub fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
|
||||
use std::char;
|
||||
|
||||
// Handle non-escaped chars first.
|
||||
@ -258,8 +258,19 @@ pub fn char_lit(lit: &str) -> (char, isize) {
|
||||
'u' => {
|
||||
assert_eq!(lit.as_bytes()[2], b'{');
|
||||
let idx = lit.find('}').unwrap();
|
||||
let v = u32::from_str_radix(&lit[3..idx], 16).unwrap();
|
||||
let c = char::from_u32(v).unwrap();
|
||||
let s = &lit[3..idx].chars().filter(|&c| c != '_').collect::<String>();
|
||||
let v = u32::from_str_radix(&s, 16).unwrap();
|
||||
let c = char::from_u32(v).unwrap_or_else(|| {
|
||||
if let Some((span, diag)) = diag {
|
||||
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
|
||||
if v > 0x10FFFF {
|
||||
diag.help("unicode escape must be at most 10FFFF").emit();
|
||||
} else {
|
||||
diag.help("unicode escape must not be a surrogate").emit();
|
||||
}
|
||||
}
|
||||
'\u{FFFD}'
|
||||
});
|
||||
(c, (idx + 1) as isize)
|
||||
}
|
||||
_ => panic!("lexer should have rejected a bad character escape {}", lit)
|
||||
@ -272,7 +283,7 @@ pub fn escape_default(s: &str) -> String {
|
||||
|
||||
/// Parse a string representing a string literal into its final form. Does
|
||||
/// unescaping.
|
||||
pub fn str_lit(lit: &str) -> String {
|
||||
pub fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
|
||||
debug!("parse_str_lit: given {}", escape_default(lit));
|
||||
let mut res = String::with_capacity(lit.len());
|
||||
|
||||
@ -313,7 +324,7 @@ pub fn str_lit(lit: &str) -> String {
|
||||
eat(&mut chars);
|
||||
} else {
|
||||
// otherwise, a normal escape
|
||||
let (c, n) = char_lit(&lit[i..]);
|
||||
let (c, n) = char_lit(&lit[i..], diag);
|
||||
for _ in 0..n - 1 { // we don't need to move past the first \
|
||||
chars.next();
|
||||
}
|
||||
@ -385,7 +396,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand
|
||||
|
||||
match lit {
|
||||
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
|
||||
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str()).0))),
|
||||
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
|
||||
|
||||
// There are some valid suffixes for integer and float literals,
|
||||
// so all the handling is done internally.
|
||||
@ -393,7 +404,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand
|
||||
token::Float(s) => (false, float_lit(&s.as_str(), suf, diag)),
|
||||
|
||||
token::Str_(s) => {
|
||||
let s = Symbol::intern(&str_lit(&s.as_str()));
|
||||
let s = Symbol::intern(&str_lit(&s.as_str(), diag));
|
||||
(true, Some(LitKind::Str(s, ast::StrStyle::Cooked)))
|
||||
}
|
||||
token::StrRaw(s, n) => {
|
||||
|
@ -41,9 +41,8 @@ fn main() {
|
||||
//~^^^ ERROR incorrect unicode escape sequence
|
||||
//~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
|
||||
|
||||
let _ = "\u{ffffff} \xf \u";
|
||||
//~^ ERROR invalid unicode character escape
|
||||
//~^^ ERROR invalid character in numeric character escape:
|
||||
//~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
|
||||
//~^^^^ ERROR incorrect unicode escape sequence
|
||||
let _ = "\xf \u";
|
||||
//~^ ERROR invalid character in numeric character escape:
|
||||
//~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
|
||||
//~^^^ ERROR incorrect unicode escape sequence
|
||||
}
|
||||
|
15
src/test/parse-fail/issue-43692.rs
Normal file
15
src/test/parse-fail/issue-43692.rs
Normal file
@ -0,0 +1,15 @@
|
||||
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// compile-flags: -Z parse-only
|
||||
|
||||
fn main() {
|
||||
'\u{_10FFFF}'; //~ ERROR invalid start of unicode escape
|
||||
}
|
@ -11,5 +11,5 @@
|
||||
// compile-flags: -Z parse-only
|
||||
|
||||
pub fn main() {
|
||||
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits)
|
||||
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (must have at most 6 hex digits)
|
||||
}
|
||||
|
@ -11,5 +11,6 @@
|
||||
// compile-flags: -Z parse-only
|
||||
|
||||
pub fn main() {
|
||||
let s = "\u{d805}"; //~ ERROR invalid unicode character escape
|
||||
let s1 = "\u{d805}"; //~ ERROR invalid unicode character escape
|
||||
let s2 = "\u{ffffff}"; //~ ERROR invalid unicode character escape
|
||||
}
|
||||
|
@ -13,6 +13,4 @@
|
||||
pub fn main() {
|
||||
let s = "\u{lol}";
|
||||
//~^ ERROR invalid character in unicode escape: l
|
||||
//~^^ ERROR invalid character in unicode escape: o
|
||||
//~^^^ ERROR invalid character in unicode escape: l
|
||||
}
|
||||
|
14
src/test/run-pass/issue-43692.rs
Normal file
14
src/test/run-pass/issue-43692.rs
Normal file
@ -0,0 +1,14 @@
|
||||
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
fn main() {
|
||||
assert_eq!('\u{10__FFFF}', '\u{10FFFF}');
|
||||
assert_eq!("\u{10_F0FF__}foo\u{1_0_0_0__}", "\u{10F0FF}foo\u{1000}");
|
||||
}
|
Loading…
Reference in New Issue
Block a user