From 856b55fb34050a56f4ce2a5e171d8a7dca19729c Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 6 Dec 2023 20:36:44 +1100
Subject: [PATCH 1/9] De-pub some functions.

---
 compiler/rustc_lexer/src/unescape.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 717b042fbda..d98a702427e 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -167,7 +167,7 @@ impl Mode {
     }
 
     /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
-    pub fn ascii_escapes_should_be_ascii(self) -> bool {
+    fn ascii_escapes_should_be_ascii(self) -> bool {
         match self {
             Mode::Char | Mode::Str | Mode::RawStr => true,
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
@@ -175,7 +175,7 @@ impl Mode {
     }
 
     /// Whether characters within the literal must be within the ASCII range
-    pub fn characters_should_be_ascii(self) -> bool {
+    fn characters_should_be_ascii(self) -> bool {
         match self {
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
             Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
@@ -183,7 +183,7 @@ impl Mode {
     }
 
     /// Byte literals do not allow unicode escape.
-    pub fn is_unicode_escape_disallowed(self) -> bool {
+    fn is_unicode_escape_disallowed(self) -> bool {
         match self {
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
             Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,

From e290582abff724811eca6dd021f97955cb13a3b8 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 6 Dec 2023 20:37:02 +1100
Subject: [PATCH 2/9] Identify impossible cases in
 `ascii_escapes_should_be_ascii`.

Raw strings (of all kinds) don't support escapes, so this function
should never be called on them.
---
 compiler/rustc_lexer/src/unescape.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index d98a702427e..ddbe826f570 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -169,8 +169,9 @@ impl Mode {
     /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
     fn ascii_escapes_should_be_ascii(self) -> bool {
         match self {
-            Mode::Char | Mode::Str | Mode::RawStr => true,
-            Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
+            Mode::Char | Mode::Str => true,
+            Mode::Byte | Mode::ByteStr | Mode::CStr => false,
+            Mode::RawStr | Mode::RawByteStr | Mode::RawCStr => unreachable!(),
         }
     }
 

From c6bbb376a24c1397ca13078192418ef746382b10 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 6 Dec 2023 20:39:08 +1100
Subject: [PATCH 3/9] Fix an out-of-date comment.

---
 compiler/rustc_lexer/src/unescape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index ddbe826f570..dab656b35f9 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -395,7 +395,7 @@ where
     let mut chars = src.chars();
 
     // The `start` and `end` computation here matches the one in
-    // `unescape_str_or_byte_str` for consistency, even though this function
+    // `unescape_str_common` for consistency, even though this function
     // doesn't have to worry about skipping any chars.
     while let Some(c) = chars.next() {
         let start = src.len() - chars.as_str().len() - c.len_utf8();

From 08b8ba0a3207b2c5e34cab7f85323908f8bd8b37 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 6 Dec 2023 20:43:59 +1100
Subject: [PATCH 4/9] Add some useful comments.

---
 compiler/rustc_lexer/src/unescape.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index dab656b35f9..5559c03edea 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -7,7 +7,9 @@ use std::str::Chars;
 #[cfg(test)]
 mod tests;
 
-/// Errors and warnings that can occur during string unescaping.
+/// Errors and warnings that can occur during string unescaping. They mostly
+/// relate to malformed escape sequences, but there are a few that are about
+/// other problems.
 #[derive(Debug, PartialEq, Eq)]
 pub enum EscapeError {
     /// Expected 1 char, but 0 were found.
@@ -73,9 +75,11 @@ impl EscapeError {
     }
 }
 
-/// Takes a contents of a literal (without quotes) and produces a
-/// sequence of escaped characters or errors.
-/// Values are returned through invoking of the provided callback.
+/// Takes a contents of a literal (without quotes) and produces a sequence of
+/// escaped characters or errors.
+///
+/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
+/// the callback will be called exactly once.
 pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),

From f883762970b4831e61b7b3355981003868946897 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 7 Dec 2023 09:16:18 +1100
Subject: [PATCH 5/9] Remove explicit `\n` and `\t` handling in
 `unescape_str_common`.

The fallback `_` case already handles these chars correctly, so there is
no need to treat them specially.
---
 compiler/rustc_lexer/src/unescape.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 5559c03edea..545c579cdb5 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -351,8 +351,6 @@ where
                     _ => scan_escape::<T>(&mut chars, mode),
                 }
             }
-            '\n' => Ok(b'\n'.into()),
-            '\t' => Ok(b'\t'.into()),
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
             _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),

From 119b1d0c63760eb309da5176f31a58f9854d6815 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 7 Dec 2023 09:25:53 +1100
Subject: [PATCH 6/9] Eliminate `is_byte: bool` args in unescaping code.

These don't really make sense since C string literals were added. This
commit removes them in favour of `mode: Mode` args. `ascii_check` still
has a bool arg, now named `chars_should_be_ascii`.

Also, `Mode::characters_should_be_ascii` is renamed to
`chars_should_be_ascii` to be shorter.
---
 compiler/rustc_lexer/src/unescape.rs | 40 +++++++++++++---------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 545c579cdb5..8c5b2e3635e 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -87,14 +87,12 @@ where
     match mode {
         Mode::Char | Mode::Byte => {
             let mut chars = src.chars();
-            let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
+            let res = unescape_char_or_byte(&mut chars, mode);
             callback(0..(src.len() - chars.as_str().len()), res);
         }
         Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
 
-        Mode::RawStr | Mode::RawByteStr => {
-            unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
-        }
+        Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
         Mode::CStr | Mode::RawCStr => unreachable!(),
     }
 }
@@ -122,11 +120,9 @@ where
     F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
 {
     if mode == Mode::RawCStr {
-        unescape_raw_str_or_raw_byte_str(
-            src,
-            mode.characters_should_be_ascii(),
-            &mut |r, result| callback(r, result.map(CStrUnit::Char)),
-        );
+        unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
+            callback(r, result.map(CStrUnit::Char))
+        });
     } else {
         unescape_str_common(src, mode, callback);
     }
@@ -135,13 +131,13 @@ where
 /// Takes a contents of a char literal (without quotes), and returns an
 /// unescaped char or an error.
 pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), false)
+    unescape_char_or_byte(&mut src.chars(), Mode::Char)
 }
 
 /// Takes a contents of a byte literal (without quotes), and returns an
 /// unescaped byte or an error.
 pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), true).map(byte_from_char)
+    unescape_char_or_byte(&mut src.chars(), Mode::Byte).map(byte_from_char)
 }
 
 /// What kind of literal do we parse.
@@ -180,7 +176,8 @@ impl Mode {
     }
 
     /// Whether characters within the literal must be within the ASCII range
-    fn characters_should_be_ascii(self) -> bool {
+    #[inline]
+    fn chars_should_be_ascii(self) -> bool {
         match self {
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
             Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
@@ -299,22 +296,21 @@ fn scan_unicode(
 }
 
 #[inline]
-fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> {
-    if characters_should_be_ascii && !c.is_ascii() {
-        // Byte literal can't be a non-ascii character.
+fn ascii_check(c: char, chars_should_be_ascii: bool) -> Result<char, EscapeError> {
+    if chars_should_be_ascii && !c.is_ascii() {
         Err(EscapeError::NonAsciiCharInByte)
     } else {
         Ok(c)
     }
 }
 
-fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
+fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
     let c = chars.next().ok_or(EscapeError::ZeroChars)?;
     let res = match c {
-        '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }),
+        '\\' => scan_escape(chars, mode),
         '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
         '\r' => Err(EscapeError::BareCarriageReturn),
-        _ => ascii_check(c, is_byte),
+        _ => ascii_check(c, mode.chars_should_be_ascii()),
     }?;
     if chars.next().is_some() {
         return Err(EscapeError::MoreThanOneChar);
@@ -329,6 +325,7 @@ where
     F: FnMut(Range<usize>, Result<T, EscapeError>),
 {
     let mut chars = src.chars();
+    let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
 
     // The `start` and `end` computation here is complicated because
     // `skip_ascii_whitespace` makes us to skip over chars without counting
@@ -353,7 +350,7 @@ where
             }
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),
+            _ => ascii_check(c, chars_should_be_ascii).map(Into::into),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, res.map(Into::into));
@@ -390,11 +387,12 @@ where
 /// sequence of characters or errors.
 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 /// only produce errors on bare CR.
-fn unescape_raw_str_or_raw_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
+fn unescape_raw_str_or_raw_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
     let mut chars = src.chars();
+    let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
 
     // The `start` and `end` computation here matches the one in
     // `unescape_str_common` for consistency, even though this function
@@ -403,7 +401,7 @@ where
         let start = src.len() - chars.as_str().len() - c.len_utf8();
         let res = match c {
             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            _ => ascii_check(c, is_byte),
+            _ => ascii_check(c, chars_should_be_ascii),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, res);

From 9741dba7fa3fd61a8851e906861ac88bc527ca4c Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 7 Dec 2023 10:49:42 +1100
Subject: [PATCH 7/9] Tweak the `no-nuls.rs` test.

The `empty!` macro calls should be outside the `cfg(FALSE)` function.
---
 .../rfcs/rfc-3348-c-string-literals/no-nuls.rs  | Bin 594 -> 738 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs
index a7e36b2233ec5478ddbc6d2f689c40d535c209ef..e20ca50b88f905d5a0332a71f64d02c061f21711 100644
GIT binary patch
delta 160
zcmcb_@`!cAR7M+peT9&WRE6ZUbcN*nyyDW_)MAAapg>M$NorAIPO(B#YFd6#szP3A
zjzV%qYI1gFUb;ela&l==vEJk<jN!qxTwG8yaubt_@<}&^D=kl<B(=CCJ~g+Xq*6mu
Kp?dOb#&iG;0y)zF

delta 15
XcmaFFdWmJjRL04d7{ex4Go=FnHVOte


From adc46e5c08df7d3fe68293ee94add19b653b20aa Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 7 Dec 2023 18:52:11 +1100
Subject: [PATCH 8/9] Remove an unnecessary `into`.

---
 compiler/rustc_lexer/src/unescape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 8c5b2e3635e..72cb78f82a2 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -353,7 +353,7 @@ where
             _ => ascii_check(c, chars_should_be_ascii).map(Into::into),
         };
         let end = src.len() - chars.as_str().len();
-        callback(start..end, res.map(Into::into));
+        callback(start..end, res);
     }
 }
 

From 0a401b624bff7cfa776bac2f767b21ffdb1bd98f Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 7 Dec 2023 11:18:09 +1100
Subject: [PATCH 9/9] Tweak `Mode`.

- Add `use Mode::*` to avoid all the qualifiers.
- Reorder the variants. The existing order makes no particular sense,
  which has bugged me for some time. I've chosen an order that makes
  sense to me.
---
 compiler/rustc_lexer/src/unescape.rs | 72 +++++++++++++++-------------
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 72cb78f82a2..249126a269e 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -4,6 +4,8 @@
 use std::ops::Range;
 use std::str::Chars;
 
+use Mode::*;
+
 #[cfg(test)]
 mod tests;
 
@@ -85,15 +87,14 @@ where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
     match mode {
-        Mode::Char | Mode::Byte => {
+        Char | Byte => {
             let mut chars = src.chars();
             let res = unescape_char_or_byte(&mut chars, mode);
             callback(0..(src.len() - chars.as_str().len()), res);
         }
-        Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
-
-        Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
-        Mode::CStr | Mode::RawCStr => unreachable!(),
+        Str | ByteStr => unescape_str_common(src, mode, callback),
+        RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
+        CStr | RawCStr => unreachable!(),
     }
 }
 
@@ -119,36 +120,44 @@ pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
 {
-    if mode == Mode::RawCStr {
-        unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
-            callback(r, result.map(CStrUnit::Char))
-        });
-    } else {
-        unescape_str_common(src, mode, callback);
+    match mode {
+        CStr => {
+            unescape_str_common(src, mode, callback);
+        }
+        RawCStr => {
+            unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
+                callback(r, result.map(CStrUnit::Char))
+            });
+        }
+        Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
     }
 }
 
 /// Takes a contents of a char literal (without quotes), and returns an
 /// unescaped char or an error.
 pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Mode::Char)
+    unescape_char_or_byte(&mut src.chars(), Char)
 }
 
 /// Takes a contents of a byte literal (without quotes), and returns an
 /// unescaped byte or an error.
 pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Mode::Byte).map(byte_from_char)
+    unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
 }
 
 /// What kind of literal do we parse.
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
     Char,
-    Str,
+
     Byte,
-    ByteStr,
+
+    Str,
     RawStr,
+
+    ByteStr,
     RawByteStr,
+
     CStr,
     RawCStr,
 }
@@ -156,47 +165,42 @@ pub enum Mode {
 impl Mode {
     pub fn in_double_quotes(self) -> bool {
         match self {
-            Mode::Str
-            | Mode::ByteStr
-            | Mode::RawStr
-            | Mode::RawByteStr
-            | Mode::CStr
-            | Mode::RawCStr => true,
-            Mode::Char | Mode::Byte => false,
+            Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
+            Char | Byte => false,
         }
     }
 
     /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
     fn ascii_escapes_should_be_ascii(self) -> bool {
         match self {
-            Mode::Char | Mode::Str => true,
-            Mode::Byte | Mode::ByteStr | Mode::CStr => false,
-            Mode::RawStr | Mode::RawByteStr | Mode::RawCStr => unreachable!(),
+            Char | Str => true,
+            Byte | ByteStr | CStr => false,
+            RawStr | RawByteStr | RawCStr => unreachable!(),
         }
     }
 
-    /// Whether characters within the literal must be within the ASCII range
+    /// Whether characters within the literal must be within the ASCII range.
     #[inline]
     fn chars_should_be_ascii(self) -> bool {
         match self {
-            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
-            Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+            Byte | ByteStr | RawByteStr => true,
+            Char | Str | RawStr | CStr | RawCStr => false,
         }
     }
 
     /// Byte literals do not allow unicode escape.
     fn is_unicode_escape_disallowed(self) -> bool {
         match self {
-            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
-            Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+            Byte | ByteStr | RawByteStr => true,
+            Char | Str | RawStr | CStr | RawCStr => false,
         }
     }
 
     pub fn prefix_noraw(self) -> &'static str {
         match self {
-            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
-            Mode::CStr | Mode::RawCStr => "c",
-            Mode::Char | Mode::Str | Mode::RawStr => "",
+            Char | Str | RawStr => "",
+            Byte | ByteStr | RawByteStr => "b",
+            CStr | RawCStr => "c",
         }
     }
 }
@@ -411,7 +415,7 @@ where
 #[inline]
 pub fn byte_from_char(c: char) -> u8 {
     let res = c as u32;
-    debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
+    debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
     res as u8
 }