From 7dbf2c0ed86a6fc97aa0b93bc2ac865d6f2cc438 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 15:17:37 +1100 Subject: [PATCH] Make non-ASCII errors more consistent. There are three kinds of "byte" literals: byte literals, byte string literals, and raw byte string literals. None are allowed to have non-ASCII chars in them. Two `EscapeError` variants exist for when that constraint is violated. - `NonAsciiCharInByte`: used for byte literals and byte string literals. - `NonAsciiCharInByteString`: used for raw byte string literals. As a result, the messages for raw byte string literals use different wording, without good reason. Also, byte string literals are incorrectly described as "byte constants" in some error messages. This commit eliminates `NonAsciiCharInByteString` so the three cases are handled similarly, and described correctly. The `mode` is enough to distinguish them. Note: Some existing error messages mention "byte constants" and some mention "byte literals". I went with the latter here, because it's a more correct name, as used by the Reference. --- compiler/rustc_lexer/src/unescape.rs | 7 ++-- compiler/rustc_lexer/src/unescape/tests.rs | 7 ++-- .../src/lexer/unescape_error_reporting.rs | 32 ++++++++----------- src/test/ui/attributes/key-value-non-ascii.rs | 2 +- .../ui/attributes/key-value-non-ascii.stderr | 4 +-- src/test/ui/parser/byte-literals.rs | 2 +- src/test/ui/parser/byte-literals.stderr | 4 +-- src/test/ui/parser/byte-string-literals.rs | 4 +-- .../ui/parser/byte-string-literals.stderr | 6 ++-- .../ui/parser/raw/raw-byte-string-literals.rs | 2 +- .../raw/raw-byte-string-literals.stderr | 2 +- .../ui/parser/unicode-control-codepoints.rs | 16 +++++----- .../parser/unicode-control-codepoints.stderr | 24 +++++++------- src/test/ui/suggestions/multibyte-escapes.rs | 12 +++---- .../ui/suggestions/multibyte-escapes.stderr | 12 +++---- 15 files changed, 62 insertions(+), 74 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index f0042a397c2..9c9cce7cbd4 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -52,10 +52,8 @@ pub enum EscapeError { /// Unicode escape code in byte literal. UnicodeEscapeInByte, - /// Non-ascii character in byte literal. + /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. NonAsciiCharInByte, - /// Non-ascii character in byte string literal. - NonAsciiCharInByteString, /// After a line ending with '\', the next line contains whitespace /// characters that are not skipped. @@ -349,8 +347,7 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let result = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - c => Ok(c), + _ => ascii_check(c, is_byte), }; let end = src.len() - chars.as_str().len(); callback(start..end, result); diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index fa61554afde..008edef5a63 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -289,9 +289,6 @@ fn test_unescape_raw_byte_str() { } check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); - check( - "🦀a", - &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], - ); + check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]); } diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index 055ee98a00a..6373f5b4fd6 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error( .emit(); } EscapeError::NonAsciiCharInByte => { - assert!(mode.is_byte()); let (c, span) = last_char(); - let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); + let desc = match mode { + Mode::Byte => "byte literal", + Mode::ByteStr => "byte string literal", + Mode::RawByteStr => "raw byte string literal", + _ => panic!("non-is_byte literal paired with NonAsciiCharInByte"), + }; + let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc)); let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { format!(" but is {:?}", c) } else { String::new() }; - err.span_label(span, &format!("byte constant must be ASCII{}", postfix)); - if (c as u32) <= 0xFF { + err.span_label(span, &format!("must be ASCII{}", postfix)); + // Note: the \\xHH suggestions are not given for raw byte string + // literals, because they are araw and so cannot use any escapes. + if (c as u32) <= 0xFF && mode != Mode::RawByteStr { err.span_suggestion( span, &format!( @@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error( format!("\\x{:X}", c as u32), Applicability::MaybeIncorrect, ); - } else if matches!(mode, Mode::Byte) { + } else if mode == Mode::Byte { err.span_label(span, "this multibyte character does not fit into a single byte"); - } else if matches!(mode, Mode::ByteStr) { + } else if mode != Mode::RawByteStr { let mut utf8 = String::new(); utf8.push(c); err.span_suggestion( @@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error( } err.emit(); } - EscapeError::NonAsciiCharInByteString => { - assert!(mode.is_byte()); - let (c, span) = last_char(); - let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { - format!(" but is {:?}", c) - } else { - String::new() - }; - handler - .struct_span_err(span, "raw byte string must be ASCII") - .span_label(span, &format!("must be ASCII{}", postfix)) - .emit(); - } EscapeError::OutOfRangeHexEscape => { handler .struct_span_err(span, "out of range hex escape") diff --git a/src/test/ui/attributes/key-value-non-ascii.rs b/src/test/ui/attributes/key-value-non-ascii.rs index 12942eabdf7..e14e2fc05ad 100644 --- a/src/test/ui/attributes/key-value-non-ascii.rs +++ b/src/test/ui/attributes/key-value-non-ascii.rs @@ -1,4 +1,4 @@ #![feature(rustc_attrs)] -#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte constant +#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal fn main() {} diff --git a/src/test/ui/attributes/key-value-non-ascii.stderr b/src/test/ui/attributes/key-value-non-ascii.stderr index 422107867f7..23d482de6a8 100644 --- a/src/test/ui/attributes/key-value-non-ascii.stderr +++ b/src/test/ui/attributes/key-value-non-ascii.stderr @@ -1,8 +1,8 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/key-value-non-ascii.rs:3:19 | LL | #[rustc_dummy = b"ffi.rs"] - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes | diff --git a/src/test/ui/parser/byte-literals.rs b/src/test/ui/parser/byte-literals.rs index 05a510b24a7..896dc1a1a5f 100644 --- a/src/test/ui/parser/byte-literals.rs +++ b/src/test/ui/parser/byte-literals.rs @@ -7,6 +7,6 @@ pub fn main() { b'\x0Z'; //~ ERROR invalid character in numeric character escape: `Z` b' '; //~ ERROR byte constant must be escaped b'''; //~ ERROR byte constant must be escaped - b'é'; //~ ERROR non-ASCII character in byte constant + b'é'; //~ ERROR non-ASCII character in byte literal b'a //~ ERROR unterminated byte constant [E0763] } diff --git a/src/test/ui/parser/byte-literals.stderr b/src/test/ui/parser/byte-literals.stderr index c3d00061630..efa55ae05bd 100644 --- a/src/test/ui/parser/byte-literals.stderr +++ b/src/test/ui/parser/byte-literals.stderr @@ -32,11 +32,11 @@ error: byte constant must be escaped: `'` LL | b'''; | ^ help: escape the character: `\'` -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/byte-literals.rs:10:7 | LL | b'é'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a \xHH escape | diff --git a/src/test/ui/parser/byte-string-literals.rs b/src/test/ui/parser/byte-string-literals.rs index b1f11024a7b..30a4f50c4e4 100644 --- a/src/test/ui/parser/byte-string-literals.rs +++ b/src/test/ui/parser/byte-string-literals.rs @@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape pub fn main() { b"\f"; //~ ERROR unknown byte escape b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` - b"é"; //~ ERROR non-ASCII character in byte constant - br##"é"##; //~ ERROR raw byte string must be ASCII + b"é"; //~ ERROR non-ASCII character in byte string literal + br##"é"##; //~ ERROR non-ASCII character in raw byte string literal b"a //~ ERROR unterminated double quote byte string } diff --git a/src/test/ui/parser/byte-string-literals.stderr b/src/test/ui/parser/byte-string-literals.stderr index 3b8b3692e05..5b96cc3d18a 100644 --- a/src/test/ui/parser/byte-string-literals.stderr +++ b/src/test/ui/parser/byte-string-literals.stderr @@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z` LL | b"\x0Z"; | ^ invalid character in numeric character escape -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/byte-string-literals.rs:6:7 | LL | b"é"; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a \xHH escape | LL | b"\xE9"; | ~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/byte-string-literals.rs:7:10 | LL | br##"é"##; diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.rs b/src/test/ui/parser/raw/raw-byte-string-literals.rs index 163c8ac66b0..1b859fee596 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.rs +++ b/src/test/ui/parser/raw/raw-byte-string-literals.rs @@ -2,6 +2,6 @@ pub fn main() { br"a "; //~ ERROR bare CR not allowed in raw string - br"é"; //~ ERROR raw byte string must be ASCII + br"é"; //~ ERROR non-ASCII character in raw byte string literal br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.stderr b/src/test/ui/parser/raw/raw-byte-string-literals.stderr index cfc877104bd..a2f27d1ed70 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw/raw-byte-string-literals.stderr @@ -4,7 +4,7 @@ error: bare CR not allowed in raw string LL | br"a "; | ^ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/raw-byte-string-literals.rs:5:8 | LL | br"é"; diff --git a/src/test/ui/parser/unicode-control-codepoints.rs b/src/test/ui/parser/unicode-control-codepoints.rs index 5af0b585a12..df099bb62ad 100644 --- a/src/test/ui/parser/unicode-control-codepoints.rs +++ b/src/test/ui/parser/unicode-control-codepoints.rs @@ -14,15 +14,15 @@ fn main() { println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); //~^ ERROR unicode codepoint changing visible direction of text present in literal println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); - //~^ ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant + //~^ ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); - //~^ ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII + //~^ ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal println!("{:?}", '‮'); //~^ ERROR unicode codepoint changing visible direction of text present in literal } diff --git a/src/test/ui/parser/unicode-control-codepoints.stderr b/src/test/ui/parser/unicode-control-codepoints.stderr index 44548c72ff5..fc071a94191 100644 --- a/src/test/ui/parser/unicode-control-codepoints.stderr +++ b/src/test/ui/parser/unicode-control-codepoints.stderr @@ -14,69 +14,69 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); | = help: unicode escape sequences cannot be used as a byte or in a byte string -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:26 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{202e}' + | ^ must be ASCII but is '\u{202e}' | help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes | LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:30 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:41 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2069}' + | ^ must be ASCII but is '\u{2069}' | help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:43 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); | ~~~~~~~~~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:29 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{202e}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:33 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2066}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:44 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2069}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:46 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); diff --git a/src/test/ui/suggestions/multibyte-escapes.rs b/src/test/ui/suggestions/multibyte-escapes.rs index fd5d46a4e92..c4105186244 100644 --- a/src/test/ui/suggestions/multibyte-escapes.rs +++ b/src/test/ui/suggestions/multibyte-escapes.rs @@ -2,17 +2,17 @@ fn main() { b'µ'; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte literal //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b'字'; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte literal //~| NOTE: this multibyte character does not fit into a single byte - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b"字"; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte string literal //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII } diff --git a/src/test/ui/suggestions/multibyte-escapes.stderr b/src/test/ui/suggestions/multibyte-escapes.stderr index 6e26bc1f01c..1e7c43e6538 100644 --- a/src/test/ui/suggestions/multibyte-escapes.stderr +++ b/src/test/ui/suggestions/multibyte-escapes.stderr @@ -1,28 +1,28 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:4:7 | LL | b'µ'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'µ', use a \xHH escape | LL | b'\xB5'; | ~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:9:7 | LL | b'字'; | ^^ | | - | byte constant must be ASCII + | must be ASCII | this multibyte character does not fit into a single byte -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/multibyte-escapes.rs:14:7 | LL | b"字"; - | ^^ byte constant must be ASCII + | ^^ must be ASCII | help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes |