Make non-ASCII errors more consistent.

There are three kinds of "byte" literals: byte literals, byte string literals, and raw byte string literals. None are allowed to have non-ASCII chars in them.

Two `EscapeError` variants exist for when that constraint is violated.
- `NonAsciiCharInByte`: used for byte literals and byte string literals.
- `NonAsciiCharInByteString`: used for raw byte string literals.

As a result, the messages for raw byte string literals use different wording, without good reason. Also, byte string literals are incorrectly described as "byte constants" in some error messages.

This commit eliminates `NonAsciiCharInByteString` so the three cases are handled similarly, and described correctly. The `mode` is enough to distinguish them.

Note: Some existing error messages mention "byte constants" and some mention "byte literals". I went with the latter here, because it's a more correct name, as used by the Reference.
2025-01-19 11:12:43 +00:00 · 2022-11-03 15:17:37 +11:00 · 2022-11-03 15:17:37 +11:00 · 7dbf2c0ed8
commit 7dbf2c0ed8
parent 34b32b0dac
15 changed files with 62 additions and 74 deletions
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@ -52,10 +52,8 @@ pub enum EscapeError {

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
-    /// Non-ascii character in byte literal.
+    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,
-    /// Non-ascii character in byte string literal.
-    NonAsciiCharInByteString,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
@ -349,8 +347,7 @@ where
        let start = src.len() - chars.as_str().len() - c.len_utf8();
        let result = match c {
            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString),
-            c => Ok(c),
+            _ => ascii_check(c, is_byte),
        };
        let end = src.len() - chars.as_str().len();
        callback(start..end, result);
--- a/compiler/rustc_lexer/src/unescape/tests.rs
+++ b/compiler/rustc_lexer/src/unescape/tests.rs
@ -289,9 +289,6 @@ fn test_unescape_raw_byte_str() {
    }

    check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
-    check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
-    check(
-        "🦀a",
-        &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))],
-    );
+    check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
+    check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]);
 }
--- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
+++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error(
                .emit();
        }
        EscapeError::NonAsciiCharInByte => {
-            assert!(mode.is_byte());
            let (c, span) = last_char();
-            let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant");
+            let desc = match mode {
+                Mode::Byte => "byte literal",
+                Mode::ByteStr => "byte string literal",
+                Mode::RawByteStr => "raw byte string literal",
+                _ => panic!("non-is_byte literal paired with NonAsciiCharInByte"),
+            };
+            let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc));
            let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
                format!(" but is {:?}", c)
            } else {
                String::new()
            };
-            err.span_label(span, &format!("byte constant must be ASCII{}", postfix));
-            if (c as u32) <= 0xFF {
+            err.span_label(span, &format!("must be ASCII{}", postfix));
+            // Note: the \\xHH suggestions are not given for raw byte string
+            // literals, because they are raw and so cannot use any escapes.
+            if (c as u32) <= 0xFF && mode != Mode::RawByteStr {
                err.span_suggestion(
                    span,
                    &format!(
@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error(
                    format!("\\x{:X}", c as u32),
                    Applicability::MaybeIncorrect,
                );
-            } else if matches!(mode, Mode::Byte) {
+            } else if mode == Mode::Byte {
                err.span_label(span, "this multibyte character does not fit into a single byte");
-            } else if matches!(mode, Mode::ByteStr) {
+            } else if mode != Mode::RawByteStr {
                let mut utf8 = String::new();
                utf8.push(c);
                err.span_suggestion(
@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error(
            }
            err.emit();
        }
-        EscapeError::NonAsciiCharInByteString => {
-            assert!(mode.is_byte());
-            let (c, span) = last_char();
-            let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
-                format!(" but is {:?}", c)
-            } else {
-                String::new()
-            };
-            handler
-                .struct_span_err(span, "raw byte string must be ASCII")
-                .span_label(span, &format!("must be ASCII{}", postfix))
-                .emit();
-        }
        EscapeError::OutOfRangeHexEscape => {
            handler
                .struct_span_err(span, "out of range hex escape")
--- a/src/test/ui/attributes/key-value-non-ascii.rs
+++ b/src/test/ui/attributes/key-value-non-ascii.rs
@ -1,4 +1,4 @@
 #![feature(rustc_attrs)]

-#[rustc_dummy = b"ﬃ.rs"] //~ ERROR non-ASCII character in byte constant
+#[rustc_dummy = b"ﬃ.rs"] //~ ERROR non-ASCII character in byte string literal
 fn main() {}
--- a/src/test/ui/attributes/key-value-non-ascii.stderr
+++ b/src/test/ui/attributes/key-value-non-ascii.stderr
@ -1,8 +1,8 @@
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/key-value-non-ascii.rs:3:19
   |
 LL | #[rustc_dummy = b"ﬃ.rs"]
-   |                   ^ byte constant must be ASCII
+   |                   ^ must be ASCII
   |
 help: if you meant to use the UTF-8 encoding of 'ﬃ', use \xHH escapes
   |
--- a/src/test/ui/parser/byte-literals.rs
+++ b/src/test/ui/parser/byte-literals.rs
@ -7,6 +7,6 @@ pub fn main() {
    b'\x0Z';  //~ ERROR invalid character in numeric character escape: `Z`
    b'	';  //~ ERROR byte constant must be escaped
    b''';  //~ ERROR byte constant must be escaped
-    b'é';  //~ ERROR non-ASCII character in byte constant
+    b'é';  //~ ERROR non-ASCII character in byte literal
    b'a  //~ ERROR unterminated byte constant [E0763]
 }
--- a/src/test/ui/parser/byte-literals.stderr
+++ b/src/test/ui/parser/byte-literals.stderr
@ -32,11 +32,11 @@ error: byte constant must be escaped: `'`
 LL |     b''';
   |       ^ help: escape the character: `\'`

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
  --> $DIR/byte-literals.rs:10:7
   |
 LL |     b'é';
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
   |
 help: if you meant to use the unicode code point for 'é', use a \xHH escape
   |
--- a/src/test/ui/parser/byte-string-literals.rs
+++ b/src/test/ui/parser/byte-string-literals.rs
@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f";  //~ ERROR unknown byte escape
 pub fn main() {
    b"\f";  //~ ERROR unknown byte escape
    b"\x0Z";  //~ ERROR invalid character in numeric character escape: `Z`
-    b"é";  //~ ERROR non-ASCII character in byte constant
-    br##"é"##;  //~ ERROR raw byte string must be ASCII
+    b"é";  //~ ERROR non-ASCII character in byte string literal
+    br##"é"##;  //~ ERROR non-ASCII character in raw byte string literal
    b"a  //~ ERROR unterminated double quote byte string
 }
--- a/src/test/ui/parser/byte-string-literals.stderr
+++ b/src/test/ui/parser/byte-string-literals.stderr
@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z`
 LL |     b"\x0Z";
   |          ^ invalid character in numeric character escape

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/byte-string-literals.rs:6:7
   |
 LL |     b"é";
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
   |
 help: if you meant to use the unicode code point for 'é', use a \xHH escape
   |
 LL |     b"\xE9";
   |       ~~~~

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/byte-string-literals.rs:7:10
   |
 LL |     br##"é"##;
--- a/src/test/ui/parser/raw/raw-byte-string-literals.rs
+++ b/src/test/ui/parser/raw/raw-byte-string-literals.rs
@ -2,6 +2,6 @@

 pub fn main() {
    br"a
"; //~ ERROR bare CR not allowed in raw string
-    br"é";  //~ ERROR raw byte string must be ASCII
+    br"é";  //~ ERROR non-ASCII character in raw byte string literal
    br##~"a"~##;  //~ ERROR only `#` is allowed in raw string delimitation
 }
--- a/src/test/ui/parser/raw/raw-byte-string-literals.stderr
+++ b/src/test/ui/parser/raw/raw-byte-string-literals.stderr
@ -4,7 +4,7 @@ error: bare CR not allowed in raw string
 LL |     br"a
";
   |         ^

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/raw-byte-string-literals.rs:5:8
   |
 LL |     br"é";
--- a/src/test/ui/parser/unicode-control-codepoints.rs
+++ b/src/test/ui/parser/unicode-control-codepoints.rs
@ -14,15 +14,15 @@ fn main() {
    println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
    //~^ ERROR unicode codepoint changing visible direction of text present in literal
    println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only ");
-    //~^ ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
+    //~^ ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
    println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
-    //~^ ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
+    //~^ ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
    println!("{:?}", '‮');
    //~^ ERROR unicode codepoint changing visible direction of text present in literal
 }
--- a/src/test/ui/parser/unicode-control-codepoints.stderr
+++ b/src/test/ui/parser/unicode-control-codepoints.stderr
@ -14,69 +14,69 @@ LL |     println!("{:?}", b"us\u{202B}e\u{202A}r");
   |
   = help: unicode escape sequences cannot be used as a byte or in a byte string

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/unicode-control-codepoints.rs:16:26
   |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                          ^ byte constant must be ASCII but is '\u{202e}'
+   |                          ^ must be ASCII but is '\u{202e}'
   |
 help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
   |
 LL |     println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin  begin admins only ");
   |                          ~~~~~~~~~~~~

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/unicode-control-codepoints.rs:16:30
   |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                             ^ byte constant must be ASCII but is '\u{2066}'
+   |                             ^ must be ASCII but is '\u{2066}'
   |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
   |
 LL |     println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin  begin admins only ");
   |                             ~~~~~~~~~~~~

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/unicode-control-codepoints.rs:16:41
   |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                       ^ byte constant must be ASCII but is '\u{2069}'
+   |                                       ^ must be ASCII but is '\u{2069}'
   |
 help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
   |
 LL |     println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9  begin admins only ");
   |                                       ~~~~~~~~~~~~

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/unicode-control-codepoints.rs:16:43
   |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                        ^ byte constant must be ASCII but is '\u{2066}'
+   |                                        ^ must be ASCII but is '\u{2066}'
   |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
   |
 LL |     println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
   |                                        ~~~~~~~~~~~~

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/unicode-control-codepoints.rs:21:29
   |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
   |                             ^ must be ASCII but is '\u{202e}'

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/unicode-control-codepoints.rs:21:33
   |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
   |                                ^ must be ASCII but is '\u{2066}'

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/unicode-control-codepoints.rs:21:44
   |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
   |                                          ^ must be ASCII but is '\u{2069}'

-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
  --> $DIR/unicode-control-codepoints.rs:21:46
   |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
--- a/src/test/ui/suggestions/multibyte-escapes.rs
+++ b/src/test/ui/suggestions/multibyte-escapes.rs
@ -2,17 +2,17 @@

 fn main() {
    b'µ';
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte literal
    //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII

    b'字';
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte literal
    //~| NOTE: this multibyte character does not fit into a single byte
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII

    b"字";
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte string literal
    //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII
 }
--- a/src/test/ui/suggestions/multibyte-escapes.stderr
+++ b/src/test/ui/suggestions/multibyte-escapes.stderr
@ -1,28 +1,28 @@
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
  --> $DIR/multibyte-escapes.rs:4:7
   |
 LL |     b'µ';
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
   |
 help: if you meant to use the unicode code point for 'µ', use a \xHH escape
   |
 LL |     b'\xB5';
   |       ~~~~

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
  --> $DIR/multibyte-escapes.rs:9:7
   |
 LL |     b'字';
   |       ^^
   |       |
-   |       byte constant must be ASCII
+   |       must be ASCII
   |       this multibyte character does not fit into a single byte

-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
  --> $DIR/multibyte-escapes.rs:14:7
   |
 LL |     b"字";
-   |       ^^ byte constant must be ASCII
+   |       ^^ must be ASCII
   |
 help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
   |