From 7dbf2c0ed86a6fc97aa0b93bc2ac865d6f2cc438 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 3 Nov 2022 15:17:37 +1100
Subject: [PATCH] Make non-ASCII errors more consistent.

There are three kinds of "byte" literals: byte literals, byte string
literals, and raw byte string literals. None are allowed to have
non-ASCII chars in them.

Two `EscapeError` variants exist for when that constraint is violated.
- `NonAsciiCharInByte`: used for byte literals and byte string literals.
- `NonAsciiCharInByteString`: used for raw byte string literals.

As a result, the messages for raw byte string literals use different
wording, without good reason. Also, byte string literals are incorrectly
described as "byte constants" in some error messages.

This commit eliminates `NonAsciiCharInByteString` so the three cases are
handled similarly, and described correctly. The `mode` is enough to
distinguish them.

Note: Some existing error messages mention "byte constants" and some
mention "byte literals". I went with the latter here, because it's a
more correct name, as used by the Reference.
---
 compiler/rustc_lexer/src/unescape.rs          |  7 ++--
 compiler/rustc_lexer/src/unescape/tests.rs    |  7 ++--
 .../src/lexer/unescape_error_reporting.rs     | 32 ++++++++-----------
 src/test/ui/attributes/key-value-non-ascii.rs |  2 +-
 .../ui/attributes/key-value-non-ascii.stderr  |  4 +--
 src/test/ui/parser/byte-literals.rs           |  2 +-
 src/test/ui/parser/byte-literals.stderr       |  4 +--
 src/test/ui/parser/byte-string-literals.rs    |  4 +--
 .../ui/parser/byte-string-literals.stderr     |  6 ++--
 .../ui/parser/raw/raw-byte-string-literals.rs |  2 +-
 .../raw/raw-byte-string-literals.stderr       |  2 +-
 .../ui/parser/unicode-control-codepoints.rs   | 16 +++++-----
 .../parser/unicode-control-codepoints.stderr  | 24 +++++++-------
 src/test/ui/suggestions/multibyte-escapes.rs  | 12 +++----
 .../ui/suggestions/multibyte-escapes.stderr   | 12 +++----
 15 files changed, 62 insertions(+), 74 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index f0042a397c2..9c9cce7cbd4 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -52,10 +52,8 @@ pub enum EscapeError {
 
     /// Unicode escape code in byte literal.
     UnicodeEscapeInByte,
-    /// Non-ascii character in byte literal.
+    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
     NonAsciiCharInByte,
-    /// Non-ascii character in byte string literal.
-    NonAsciiCharInByteString,
 
     /// After a line ending with '\', the next line contains whitespace
     /// characters that are not skipped.
@@ -349,8 +347,7 @@ where
         let start = src.len() - chars.as_str().len() - c.len_utf8();
         let result = match c {
             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString),
-            c => Ok(c),
+            _ => ascii_check(c, is_byte),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, result);
diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs
index fa61554afde..008edef5a63 100644
--- a/compiler/rustc_lexer/src/unescape/tests.rs
+++ b/compiler/rustc_lexer/src/unescape/tests.rs
@@ -289,9 +289,6 @@ fn test_unescape_raw_byte_str() {
     }
 
     check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
-    check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
-    check(
-        "🦀a",
-        &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))],
-    );
+    check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
+    check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]);
 }
diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
index 055ee98a00a..6373f5b4fd6 100644
--- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
+++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error(
                 .emit();
         }
         EscapeError::NonAsciiCharInByte => {
-            assert!(mode.is_byte());
             let (c, span) = last_char();
-            let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant");
+            let desc = match mode {
+                Mode::Byte => "byte literal",
+                Mode::ByteStr => "byte string literal",
+                Mode::RawByteStr => "raw byte string literal",
+                _ => panic!("non-is_byte literal paired with NonAsciiCharInByte"),
+            };
+            let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc));
             let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
                 format!(" but is {:?}", c)
             } else {
                 String::new()
             };
-            err.span_label(span, &format!("byte constant must be ASCII{}", postfix));
-            if (c as u32) <= 0xFF {
+            err.span_label(span, &format!("must be ASCII{}", postfix));
+            // Note: the \\xHH suggestions are not given for raw byte string
+            // literals, because they are raw and so cannot use any escapes.
+            if (c as u32) <= 0xFF && mode != Mode::RawByteStr {
                 err.span_suggestion(
                     span,
                     &format!(
@@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error(
                     format!("\\x{:X}", c as u32),
                     Applicability::MaybeIncorrect,
                 );
-            } else if matches!(mode, Mode::Byte) {
+            } else if mode == Mode::Byte {
                 err.span_label(span, "this multibyte character does not fit into a single byte");
-            } else if matches!(mode, Mode::ByteStr) {
+            } else if mode != Mode::RawByteStr {
                 let mut utf8 = String::new();
                 utf8.push(c);
                 err.span_suggestion(
@@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error(
             }
             err.emit();
         }
-        EscapeError::NonAsciiCharInByteString => {
-            assert!(mode.is_byte());
-            let (c, span) = last_char();
-            let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
-                format!(" but is {:?}", c)
-            } else {
-                String::new()
-            };
-            handler
-                .struct_span_err(span, "raw byte string must be ASCII")
-                .span_label(span, &format!("must be ASCII{}", postfix))
-                .emit();
-        }
         EscapeError::OutOfRangeHexEscape => {
             handler
                 .struct_span_err(span, "out of range hex escape")
diff --git a/src/test/ui/attributes/key-value-non-ascii.rs b/src/test/ui/attributes/key-value-non-ascii.rs
index 12942eabdf7..e14e2fc05ad 100644
--- a/src/test/ui/attributes/key-value-non-ascii.rs
+++ b/src/test/ui/attributes/key-value-non-ascii.rs
@@ -1,4 +1,4 @@
 #![feature(rustc_attrs)]
 
-#[rustc_dummy = b"ﬃ.rs"] //~ ERROR non-ASCII character in byte constant
+#[rustc_dummy = b"ﬃ.rs"] //~ ERROR non-ASCII character in byte string literal
 fn main() {}
diff --git a/src/test/ui/attributes/key-value-non-ascii.stderr b/src/test/ui/attributes/key-value-non-ascii.stderr
index 422107867f7..23d482de6a8 100644
--- a/src/test/ui/attributes/key-value-non-ascii.stderr
+++ b/src/test/ui/attributes/key-value-non-ascii.stderr
@@ -1,8 +1,8 @@
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/key-value-non-ascii.rs:3:19
    |
 LL | #[rustc_dummy = b"ﬃ.rs"]
-   |                   ^ byte constant must be ASCII
+   |                   ^ must be ASCII
    |
 help: if you meant to use the UTF-8 encoding of 'ﬃ', use \xHH escapes
    |
diff --git a/src/test/ui/parser/byte-literals.rs b/src/test/ui/parser/byte-literals.rs
index 05a510b24a7..896dc1a1a5f 100644
--- a/src/test/ui/parser/byte-literals.rs
+++ b/src/test/ui/parser/byte-literals.rs
@@ -7,6 +7,6 @@ pub fn main() {
     b'\x0Z';  //~ ERROR invalid character in numeric character escape: `Z`
     b'	';  //~ ERROR byte constant must be escaped
     b''';  //~ ERROR byte constant must be escaped
-    b'é';  //~ ERROR non-ASCII character in byte constant
+    b'é';  //~ ERROR non-ASCII character in byte literal
     b'a  //~ ERROR unterminated byte constant [E0763]
 }
diff --git a/src/test/ui/parser/byte-literals.stderr b/src/test/ui/parser/byte-literals.stderr
index c3d00061630..efa55ae05bd 100644
--- a/src/test/ui/parser/byte-literals.stderr
+++ b/src/test/ui/parser/byte-literals.stderr
@@ -32,11 +32,11 @@ error: byte constant must be escaped: `'`
 LL |     b''';
    |       ^ help: escape the character: `\'`
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
   --> $DIR/byte-literals.rs:10:7
    |
 LL |     b'é';
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
    |
 help: if you meant to use the unicode code point for 'é', use a \xHH escape
    |
diff --git a/src/test/ui/parser/byte-string-literals.rs b/src/test/ui/parser/byte-string-literals.rs
index b1f11024a7b..30a4f50c4e4 100644
--- a/src/test/ui/parser/byte-string-literals.rs
+++ b/src/test/ui/parser/byte-string-literals.rs
@@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f";  //~ ERROR unknown byte escape
 pub fn main() {
     b"\f";  //~ ERROR unknown byte escape
     b"\x0Z";  //~ ERROR invalid character in numeric character escape: `Z`
-    b"é";  //~ ERROR non-ASCII character in byte constant
-    br##"é"##;  //~ ERROR raw byte string must be ASCII
+    b"é";  //~ ERROR non-ASCII character in byte string literal
+    br##"é"##;  //~ ERROR non-ASCII character in raw byte string literal
     b"a  //~ ERROR unterminated double quote byte string
 }
diff --git a/src/test/ui/parser/byte-string-literals.stderr b/src/test/ui/parser/byte-string-literals.stderr
index 3b8b3692e05..5b96cc3d18a 100644
--- a/src/test/ui/parser/byte-string-literals.stderr
+++ b/src/test/ui/parser/byte-string-literals.stderr
@@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z`
 LL |     b"\x0Z";
    |          ^ invalid character in numeric character escape
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/byte-string-literals.rs:6:7
    |
 LL |     b"é";
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
    |
 help: if you meant to use the unicode code point for 'é', use a \xHH escape
    |
 LL |     b"\xE9";
    |       ~~~~
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/byte-string-literals.rs:7:10
    |
 LL |     br##"é"##;
diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.rs b/src/test/ui/parser/raw/raw-byte-string-literals.rs
index 163c8ac66b0..1b859fee596 100644
--- a/src/test/ui/parser/raw/raw-byte-string-literals.rs
+++ b/src/test/ui/parser/raw/raw-byte-string-literals.rs
@@ -2,6 +2,6 @@
 
 pub fn main() {
     br"a"; //~ ERROR bare CR not allowed in raw string
-    br"é";  //~ ERROR raw byte string must be ASCII
+    br"é";  //~ ERROR non-ASCII character in raw byte string literal
     br##~"a"~##;  //~ ERROR only `#` is allowed in raw string delimitation
 }
diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.stderr b/src/test/ui/parser/raw/raw-byte-string-literals.stderr
index cfc877104bd..a2f27d1ed70 100644
--- a/src/test/ui/parser/raw/raw-byte-string-literals.stderr
+++ b/src/test/ui/parser/raw/raw-byte-string-literals.stderr
@@ -4,7 +4,7 @@ error: bare CR not allowed in raw string
 LL |     br"a";
    |         ^
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/raw-byte-string-literals.rs:5:8
    |
 LL |     br"é";
diff --git a/src/test/ui/parser/unicode-control-codepoints.rs b/src/test/ui/parser/unicode-control-codepoints.rs
index 5af0b585a12..df099bb62ad 100644
--- a/src/test/ui/parser/unicode-control-codepoints.rs
+++ b/src/test/ui/parser/unicode-control-codepoints.rs
@@ -14,15 +14,15 @@ fn main() {
     println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
     //~^ ERROR unicode codepoint changing visible direction of text present in literal
     println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only ");
-    //~^ ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
-    //~| ERROR non-ASCII character in byte constant
+    //~^ ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
+    //~| ERROR non-ASCII character in byte string literal
     println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##);
-    //~^ ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
-    //~| ERROR raw byte string must be ASCII
+    //~^ ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
+    //~| ERROR non-ASCII character in raw byte string literal
     println!("{:?}", '‮');
     //~^ ERROR unicode codepoint changing visible direction of text present in literal
 }
diff --git a/src/test/ui/parser/unicode-control-codepoints.stderr b/src/test/ui/parser/unicode-control-codepoints.stderr
index 44548c72ff5..fc071a94191 100644
--- a/src/test/ui/parser/unicode-control-codepoints.stderr
+++ b/src/test/ui/parser/unicode-control-codepoints.stderr
@@ -14,69 +14,69 @@ LL |     println!("{:?}", b"us\u{202B}e\u{202A}r");
    |
    = help: unicode escape sequences cannot be used as a byte or in a byte string
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:26
    |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                          ^ byte constant must be ASCII but is '\u{202e}'
+   |                          ^ must be ASCII but is '\u{202e}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
    |
 LL |     println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin  begin admins only ");
    |                          ~~~~~~~~~~~~
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:30
    |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                             ^ byte constant must be ASCII but is '\u{2066}'
+   |                             ^ must be ASCII but is '\u{2066}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
    |
 LL |     println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin  begin admins only ");
    |                             ~~~~~~~~~~~~
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:41
    |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                       ^ byte constant must be ASCII but is '\u{2069}'
+   |                                       ^ must be ASCII but is '\u{2069}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
    |
 LL |     println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9  begin admins only ");
    |                                       ~~~~~~~~~~~~
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:43
    |
 LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                        ^ byte constant must be ASCII but is '\u{2066}'
+   |                                        ^ must be ASCII but is '\u{2066}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
    |
 LL |     println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
    |                                        ~~~~~~~~~~~~
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:29
    |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
    |                             ^ must be ASCII but is '\u{202e}'
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:33
    |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
    |                                ^ must be ASCII but is '\u{2066}'
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:44
    |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
    |                                          ^ must be ASCII but is '\u{2069}'
 
-error: raw byte string must be ASCII
+error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:46
    |
 LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
diff --git a/src/test/ui/suggestions/multibyte-escapes.rs b/src/test/ui/suggestions/multibyte-escapes.rs
index fd5d46a4e92..c4105186244 100644
--- a/src/test/ui/suggestions/multibyte-escapes.rs
+++ b/src/test/ui/suggestions/multibyte-escapes.rs
@@ -2,17 +2,17 @@
 
 fn main() {
     b'µ';
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte literal
     //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII
 
     b'字';
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte literal
     //~| NOTE: this multibyte character does not fit into a single byte
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII
 
     b"字";
-    //~^ ERROR: non-ASCII character in byte constant
+    //~^ ERROR: non-ASCII character in byte string literal
     //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
-    //~| NOTE: byte constant must be ASCII
+    //~| NOTE: must be ASCII
 }
diff --git a/src/test/ui/suggestions/multibyte-escapes.stderr b/src/test/ui/suggestions/multibyte-escapes.stderr
index 6e26bc1f01c..1e7c43e6538 100644
--- a/src/test/ui/suggestions/multibyte-escapes.stderr
+++ b/src/test/ui/suggestions/multibyte-escapes.stderr
@@ -1,28 +1,28 @@
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
   --> $DIR/multibyte-escapes.rs:4:7
    |
 LL |     b'µ';
-   |       ^ byte constant must be ASCII
+   |       ^ must be ASCII
    |
 help: if you meant to use the unicode code point for 'µ', use a \xHH escape
    |
 LL |     b'\xB5';
    |       ~~~~
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte literal
   --> $DIR/multibyte-escapes.rs:9:7
    |
 LL |     b'字';
    |       ^^
    |       |
-   |       byte constant must be ASCII
+   |       must be ASCII
    |       this multibyte character does not fit into a single byte
 
-error: non-ASCII character in byte constant
+error: non-ASCII character in byte string literal
   --> $DIR/multibyte-escapes.rs:14:7
    |
 LL |     b"字";
-   |       ^^ byte constant must be ASCII
+   |       ^^ must be ASCII
    |
 help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
    |