Auto merge of #43716 - MaloJaffre:_-in-literals, r=petrochenkov

Accept underscores in unicode escapes. Fixes #43692. I don't know if this needs an RFC, but at least the implementation is here!
2025-01-12 07:43:31 +00:00 · 2017-09-12 01:25:23 +00:00 · 2017-09-12 01:25:23 +00:00 · 11f64d8f88
commit 11f64d8f88
parent 07d950f38f d4e0e52281
8 changed files with 102 additions and 57 deletions
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@ -963,60 +963,67 @@ impl<'a> StringReader<'a> {
        true
    }

-    /// Scan over a \u{...} escape
+    /// Scan over a `\u{...}` escape
    ///
-    /// At this point, we have already seen the \ and the u, the { is the current character. We
-    /// will read at least one digit, and up to 6, and pass over the }.
+    /// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
+    /// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
+    /// and pass over the `}`.
    fn scan_unicode_escape(&mut self, delim: char) -> bool {
        self.bump(); // past the {
        let start_bpos = self.pos;
-        let mut count = 0;
-        let mut accum_int = 0;
        let mut valid = true;

-        while !self.ch_is('}') && count <= 6 {
-            let c = match self.ch {
-                Some(c) => c,
+        if let Some('_') = self.ch {
+            // disallow leading `_`
+            self.err_span_(self.pos,
+                           self.next_pos,
+                           "invalid start of unicode escape");
+            valid = false;
+        }
+
+        let count = self.scan_digits(16, 16);
+
+        if count > 6 {
+            self.err_span_(start_bpos,
+                           self.pos,
+                           "overlong unicode escape (must have at most 6 hex digits)");
+            valid = false;
+        }
+        loop {
+            match self.ch {
+                Some('}') => {
+                    if valid && count == 0 {
+                        self.err_span_(start_bpos,
+                                       self.pos,
+                                       "empty unicode escape (must have at least 1 hex digit)");
+                        valid = false;
+                    }
+                    self.bump(); // past the ending `}`
+                    break;
+                },
+                Some(c) => {
+                    if c == delim {
+                        self.err_span_(self.pos,
+                                       self.pos,
+                                       "unterminated unicode escape (needed a `}`)");
+                        valid = false;
+                        break;
+                    } else if valid {
+                        self.err_span_char(start_bpos,
+                                           self.pos,
+                                           "invalid character in unicode escape",
+                                           c);
+                        valid = false;
+                    }
+                },
                None => {
                    panic!(self.fatal_span_(start_bpos,
                                            self.pos,
                                            "unterminated unicode escape (found EOF)"));
                }
-            };
-            accum_int *= 16;
-            accum_int += c.to_digit(16).unwrap_or_else(|| {
-                if c == delim {
-                    panic!(self.fatal_span_(self.pos,
-                                            self.next_pos,
-                                            "unterminated unicode escape (needed a `}`)"));
-                } else {
-                    self.err_span_char(self.pos,
-                                       self.next_pos,
-                                       "invalid character in unicode escape",
-                                       c);
            }
-                valid = false;
-                0
-            });
            self.bump();
-            count += 1;
        }
-
-        if count > 6 {
-            self.err_span_(start_bpos,
-                           self.pos,
-                           "overlong unicode escape (can have at most 6 hex digits)");
-            valid = false;
-        }
-
-        if valid && (char::from_u32(accum_int).is_none() || count == 0) {
-            self.err_span_(start_bpos,
-                           self.pos,
-                           "invalid unicode character escape");
-            valid = false;
-        }
-
-        self.bump(); // past the ending }
        valid
    }

--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@ -230,7 +230,7 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser {
 /// Rather than just accepting/rejecting a given literal, unescapes it as
 /// well. Can take any slice prefixed by a character escape. Returns the
 /// character and the number of characters consumed.
-pub fn char_lit(lit: &str) -> (char, isize) {
+pub fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
    use std::char;

    // Handle non-escaped chars first.
@ -258,8 +258,19 @@ pub fn char_lit(lit: &str) -> (char, isize) {
        'u' => {
            assert_eq!(lit.as_bytes()[2], b'{');
            let idx = lit.find('}').unwrap();
-            let v = u32::from_str_radix(&lit[3..idx], 16).unwrap();
-            let c = char::from_u32(v).unwrap();
+            let s = &lit[3..idx].chars().filter(|&c| c != '_').collect::<String>();
+            let v = u32::from_str_radix(&s, 16).unwrap();
+            let c = char::from_u32(v).unwrap_or_else(|| {
+                if let Some((span, diag)) = diag {
+                    let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
+                    if v > 0x10FFFF {
+                        diag.help("unicode escape must be at most 10FFFF").emit();
+                    } else {
+                        diag.help("unicode escape must not be a surrogate").emit();
+                    }
+                }
+                '\u{FFFD}'
+            });
            (c, (idx + 1) as isize)
        }
        _ => panic!("lexer should have rejected a bad character escape {}", lit)
@ -272,7 +283,7 @@ pub fn escape_default(s: &str) -> String {

 /// Parse a string representing a string literal into its final form. Does
 /// unescaping.
-pub fn str_lit(lit: &str) -> String {
+pub fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
    debug!("parse_str_lit: given {}", escape_default(lit));
    let mut res = String::with_capacity(lit.len());

@ -313,7 +324,7 @@ pub fn str_lit(lit: &str) -> String {
                    eat(&mut chars);
                } else {
                    // otherwise, a normal escape
-                    let (c, n) = char_lit(&lit[i..]);
+                    let (c, n) = char_lit(&lit[i..], diag);
                    for _ in 0..n - 1 { // we don't need to move past the first \
                        chars.next();
                    }
@ -385,7 +396,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand

    match lit {
       token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
-       token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str()).0))),
+       token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),

        // There are some valid suffixes for integer and float literals,
        // so all the handling is done internally.
@ -393,7 +404,7 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand
        token::Float(s) => (false, float_lit(&s.as_str(), suf, diag)),

        token::Str_(s) => {
-            let s = Symbol::intern(&str_lit(&s.as_str()));
+            let s = Symbol::intern(&str_lit(&s.as_str(), diag));
            (true, Some(LitKind::Str(s, ast::StrStyle::Cooked)))
        }
        token::StrRaw(s, n) => {
--- a/src/test/parse-fail/issue-23620-invalid-escapes.rs
+++ b/src/test/parse-fail/issue-23620-invalid-escapes.rs
@ -41,9 +41,8 @@ fn main() {
    //~^^^ ERROR incorrect unicode escape sequence
    //~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string

-    let _ = "\u{ffffff} \xf \u";
-    //~^ ERROR invalid unicode character escape
-    //~^^ ERROR invalid character in numeric character escape:
-    //~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
-    //~^^^^ ERROR incorrect unicode escape sequence
+    let _ = "\xf \u";
+    //~^ ERROR invalid character in numeric character escape:
+    //~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
+    //~^^^ ERROR incorrect unicode escape sequence
 }
--- a/src/test/parse-fail/issue-43692.rs
+++ b/src/test/parse-fail/issue-43692.rs
@ -0,0 +1,15 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// compile-flags: -Z parse-only
+
+fn main() {
+    '\u{_10FFFF}'; //~ ERROR invalid start of unicode escape
+}
--- a/src/test/parse-fail/new-unicode-escapes-2.rs
+++ b/src/test/parse-fail/new-unicode-escapes-2.rs
@ -11,5 +11,5 @@
 // compile-flags: -Z parse-only

 pub fn main() {
-    let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits)
+    let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (must have at most 6 hex digits)
 }
--- a/src/test/parse-fail/new-unicode-escapes-3.rs
+++ b/src/test/parse-fail/new-unicode-escapes-3.rs
@ -11,5 +11,6 @@
 // compile-flags: -Z parse-only

 pub fn main() {
-    let s = "\u{d805}"; //~ ERROR invalid unicode character escape
+    let s1 = "\u{d805}"; //~ ERROR invalid unicode character escape
+    let s2 = "\u{ffffff}"; //~ ERROR invalid unicode character escape
 }
--- a/src/test/parse-fail/new-unicode-escapes-4.rs
+++ b/src/test/parse-fail/new-unicode-escapes-4.rs
@ -13,6 +13,4 @@
 pub fn main() {
    let s = "\u{lol}";
     //~^ ERROR invalid character in unicode escape: l
-     //~^^ ERROR invalid character in unicode escape: o
-     //~^^^ ERROR invalid character in unicode escape: l
 }
--- a/src/test/run-pass/issue-43692.rs
+++ b/src/test/run-pass/issue-43692.rs
@ -0,0 +1,14 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+fn main() {
+    assert_eq!('\u{10__FFFF}', '\u{10FFFF}');
+    assert_eq!("\u{10_F0FF__}foo\u{1_0_0_0__}", "\u{10F0FF}foo\u{1000}");
+}