From d4e0e5228111cd47294342a60b5f8af44c65e206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malo=20Jaffr=C3=A9?= Date: Thu, 17 Aug 2017 20:02:13 +0200 Subject: [PATCH] Accept underscores in unicode escapes Fixes #43692. --- src/libsyntax/parse/lexer/mod.rs | 89 ++++++++++--------- src/libsyntax/parse/mod.rs | 25 ++++-- .../parse-fail/issue-23620-invalid-escapes.rs | 9 +- src/test/parse-fail/issue-43692.rs | 15 ++++ src/test/parse-fail/new-unicode-escapes-2.rs | 2 +- src/test/parse-fail/new-unicode-escapes-3.rs | 3 +- src/test/parse-fail/new-unicode-escapes-4.rs | 2 - src/test/run-pass/issue-43692.rs | 14 +++ 8 files changed, 102 insertions(+), 57 deletions(-) create mode 100644 src/test/parse-fail/issue-43692.rs create mode 100644 src/test/run-pass/issue-43692.rs diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 527d2e41396..a80b7a112b0 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -963,60 +963,67 @@ impl<'a> StringReader<'a> { true } - /// Scan over a \u{...} escape + /// Scan over a `\u{...}` escape /// - /// At this point, we have already seen the \ and the u, the { is the current character. We - /// will read at least one digit, and up to 6, and pass over the }. + /// At this point, we have already seen the `\` and the `u`, the `{` is the current character. + /// We will read a hex number (with `_` separators), with 1 to 6 actual digits, + /// and pass over the `}`. 
fn scan_unicode_escape(&mut self, delim: char) -> bool { self.bump(); // past the { let start_bpos = self.pos; - let mut count = 0; - let mut accum_int = 0; let mut valid = true; - while !self.ch_is('}') && count <= 6 { - let c = match self.ch { - Some(c) => c, + if let Some('_') = self.ch { + // disallow leading `_` + self.err_span_(self.pos, + self.next_pos, + "invalid start of unicode escape"); + valid = false; + } + + let count = self.scan_digits(16, 16); + + if count > 6 { + self.err_span_(start_bpos, + self.pos, + "overlong unicode escape (must have at most 6 hex digits)"); + valid = false; + } + loop { + match self.ch { + Some('}') => { + if valid && count == 0 { + self.err_span_(start_bpos, + self.pos, + "empty unicode escape (must have at least 1 hex digit)"); + valid = false; + } + self.bump(); // past the ending `}` + break; + }, + Some(c) => { + if c == delim { + self.err_span_(self.pos, + self.pos, + "unterminated unicode escape (needed a `}`)"); + valid = false; + break; + } else if valid { + self.err_span_char(start_bpos, + self.pos, + "invalid character in unicode escape", + c); + valid = false; + } + }, None => { panic!(self.fatal_span_(start_bpos, self.pos, "unterminated unicode escape (found EOF)")); } - }; - accum_int *= 16; - accum_int += c.to_digit(16).unwrap_or_else(|| { - if c == delim { - panic!(self.fatal_span_(self.pos, - self.next_pos, - "unterminated unicode escape (needed a `}`)")); - } else { - self.err_span_char(self.pos, - self.next_pos, - "invalid character in unicode escape", - c); - } - valid = false; - 0 - }); + } self.bump(); - count += 1; } - - if count > 6 { - self.err_span_(start_bpos, - self.pos, - "overlong unicode escape (can have at most 6 hex digits)"); - valid = false; - } - - if valid && (char::from_u32(accum_int).is_none() || count == 0) { - self.err_span_(start_bpos, - self.pos, - "invalid unicode character escape"); - valid = false; - } - - self.bump(); // past the ending } valid } diff --git 
a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index 67b4954a8f1..4ef640b1197 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -230,7 +230,7 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser { /// Rather than just accepting/rejecting a given literal, unescapes it as /// well. Can take any slice prefixed by a character escape. Returns the /// character and the number of characters consumed. -pub fn char_lit(lit: &str) -> (char, isize) { +pub fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) { use std::char; // Handle non-escaped chars first. @@ -258,8 +258,19 @@ pub fn char_lit(lit: &str) -> (char, isize) { 'u' => { assert_eq!(lit.as_bytes()[2], b'{'); let idx = lit.find('}').unwrap(); - let v = u32::from_str_radix(&lit[3..idx], 16).unwrap(); - let c = char::from_u32(v).unwrap(); + let s = &lit[3..idx].chars().filter(|&c| c != '_').collect::<String>(); + let v = u32::from_str_radix(&s, 16).unwrap(); + let c = char::from_u32(v).unwrap_or_else(|| { + if let Some((span, diag)) = diag { + let mut diag = diag.struct_span_err(span, "invalid unicode character escape"); + if v > 0x10FFFF { + diag.help("unicode escape must be at most 10FFFF").emit(); + } else { + diag.help("unicode escape must not be a surrogate").emit(); + } + } + '\u{FFFD}' + }); (c, (idx + 1) as isize) } _ => panic!("lexer should have rejected a bad character escape {}", lit) @@ -272,7 +283,7 @@ pub fn escape_default(s: &str) -> String { /// Parse a string representing a string literal into its final form. Does /// unescaping. 
-pub fn str_lit(lit: &str) -> String { +pub fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String { debug!("parse_str_lit: given {}", escape_default(lit)); let mut res = String::with_capacity(lit.len()); @@ -313,7 +324,7 @@ pub fn str_lit(lit: &str) -> String { eat(&mut chars); } else { // otherwise, a normal escape - let (c, n) = char_lit(&lit[i..]); + let (c, n) = char_lit(&lit[i..], diag); for _ in 0..n - 1 { // we don't need to move past the first \ chars.next(); } @@ -385,7 +396,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand match lit { token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))), - token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str()).0))), + token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))), // There are some valid suffixes for integer and float literals, // so all the handling is done internally. @@ -393,7 +404,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand token::Float(s) => (false, float_lit(&s.as_str(), suf, diag)), token::Str_(s) => { - let s = Symbol::intern(&str_lit(&s.as_str())); + let s = Symbol::intern(&str_lit(&s.as_str(), diag)); (true, Some(LitKind::Str(s, ast::StrStyle::Cooked))) } token::StrRaw(s, n) => { diff --git a/src/test/parse-fail/issue-23620-invalid-escapes.rs b/src/test/parse-fail/issue-23620-invalid-escapes.rs index 821149d1d00..dfeaae49002 100644 --- a/src/test/parse-fail/issue-23620-invalid-escapes.rs +++ b/src/test/parse-fail/issue-23620-invalid-escapes.rs @@ -41,9 +41,8 @@ fn main() { //~^^^ ERROR incorrect unicode escape sequence //~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string - let _ = "\u{ffffff} \xf \u"; - //~^ ERROR invalid unicode character escape - //~^^ ERROR invalid character in numeric character escape: - //~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f] - //~^^^^ ERROR incorrect unicode escape sequence 
+ let _ = "\xf \u"; + //~^ ERROR invalid character in numeric character escape: + //~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f] + //~^^^ ERROR incorrect unicode escape sequence } diff --git a/src/test/parse-fail/issue-43692.rs b/src/test/parse-fail/issue-43692.rs new file mode 100644 index 00000000000..eb5d050e102 --- /dev/null +++ b/src/test/parse-fail/issue-43692.rs @@ -0,0 +1,15 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// compile-flags: -Z parse-only + +fn main() { + '\u{_10FFFF}'; //~ ERROR invalid start of unicode escape +} diff --git a/src/test/parse-fail/new-unicode-escapes-2.rs b/src/test/parse-fail/new-unicode-escapes-2.rs index 3eaea86b8bc..cdadaef1b59 100644 --- a/src/test/parse-fail/new-unicode-escapes-2.rs +++ b/src/test/parse-fail/new-unicode-escapes-2.rs @@ -11,5 +11,5 @@ // compile-flags: -Z parse-only pub fn main() { - let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits) + let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (must have at most 6 hex digits) } diff --git a/src/test/parse-fail/new-unicode-escapes-3.rs b/src/test/parse-fail/new-unicode-escapes-3.rs index d12bb63111b..8189bf67712 100644 --- a/src/test/parse-fail/new-unicode-escapes-3.rs +++ b/src/test/parse-fail/new-unicode-escapes-3.rs @@ -11,5 +11,6 @@ // compile-flags: -Z parse-only pub fn main() { - let s = "\u{d805}"; //~ ERROR invalid unicode character escape + let s1 = "\u{d805}"; //~ ERROR invalid unicode character escape + let s2 = "\u{ffffff}"; //~ ERROR invalid unicode character escape } diff --git a/src/test/parse-fail/new-unicode-escapes-4.rs 
b/src/test/parse-fail/new-unicode-escapes-4.rs index 5615ac8df01..8770fb319df 100644 --- a/src/test/parse-fail/new-unicode-escapes-4.rs +++ b/src/test/parse-fail/new-unicode-escapes-4.rs @@ -13,6 +13,4 @@ pub fn main() { let s = "\u{lol}"; //~^ ERROR invalid character in unicode escape: l - //~^^ ERROR invalid character in unicode escape: o - //~^^^ ERROR invalid character in unicode escape: l } diff --git a/src/test/run-pass/issue-43692.rs b/src/test/run-pass/issue-43692.rs new file mode 100644 index 00000000000..23e96f8c1bf --- /dev/null +++ b/src/test/run-pass/issue-43692.rs @@ -0,0 +1,14 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +fn main() { + assert_eq!('\u{10__FFFF}', '\u{10FFFF}'); + assert_eq!("\u{10_F0FF__}foo\u{1_0_0_0__}", "\u{10F0FF}foo\u{1000}"); +}