From 856b55fb34050a56f4ce2a5e171d8a7dca19729c Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 6 Dec 2023 20:36:44 +1100 Subject: [PATCH 1/9] De-pub some functions. --- compiler/rustc_lexer/src/unescape.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 717b042fbda..d98a702427e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -167,7 +167,7 @@ impl Mode { } /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. - pub fn ascii_escapes_should_be_ascii(self) -> bool { + fn ascii_escapes_should_be_ascii(self) -> bool { match self { Mode::Char | Mode::Str | Mode::RawStr => true, Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false, @@ -175,7 +175,7 @@ impl Mode { } /// Whether characters within the literal must be within the ASCII range - pub fn characters_should_be_ascii(self) -> bool { + fn characters_should_be_ascii(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, @@ -183,7 +183,7 @@ impl Mode { } /// Byte literals do not allow unicode escape. - pub fn is_unicode_escape_disallowed(self) -> bool { + fn is_unicode_escape_disallowed(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, From e290582abff724811eca6dd021f97955cb13a3b8 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 6 Dec 2023 20:37:02 +1100 Subject: [PATCH 2/9] Identify impossible cases in `ascii_escapes_should_be_ascii`. Raw strings (of all kinds) don't support escapes, so this function should never be called on them. 
--- compiler/rustc_lexer/src/unescape.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index d98a702427e..ddbe826f570 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -169,8 +169,9 @@ impl Mode { /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. fn ascii_escapes_should_be_ascii(self) -> bool { match self { - Mode::Char | Mode::Str | Mode::RawStr => true, - Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false, + Mode::Char | Mode::Str => true, + Mode::Byte | Mode::ByteStr | Mode::CStr => false, + Mode::RawStr | Mode::RawByteStr | Mode::RawCStr => unreachable!(), } } From c6bbb376a24c1397ca13078192418ef746382b10 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 6 Dec 2023 20:39:08 +1100 Subject: [PATCH 3/9] Fix an out-of-date comment. --- compiler/rustc_lexer/src/unescape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index ddbe826f570..dab656b35f9 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -395,7 +395,7 @@ where let mut chars = src.chars(); // The `start` and `end` computation here matches the one in - // `unescape_str_or_byte_str` for consistency, even though this function + // `unescape_str_common` for consistency, even though this function // doesn't have to worry about skipping any chars. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8(); From 08b8ba0a3207b2c5e34cab7f85323908f8bd8b37 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 6 Dec 2023 20:43:59 +1100 Subject: [PATCH 4/9] Add some useful comments. 
--- compiler/rustc_lexer/src/unescape.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index dab656b35f9..5559c03edea 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -7,7 +7,9 @@ use std::str::Chars; #[cfg(test)] mod tests; -/// Errors and warnings that can occur during string unescaping. +/// Errors and warnings that can occur during string unescaping. They mostly +/// relate to malformed escape sequences, but there are a few that are about +/// other problems. #[derive(Debug, PartialEq, Eq)] pub enum EscapeError { /// Expected 1 char, but 0 were found. @@ -73,9 +75,11 @@ impl EscapeError { } } -/// Takes a contents of a literal (without quotes) and produces a -/// sequence of escaped characters or errors. -/// Values are returned through invoking of the provided callback. +/// Takes a contents of a literal (without quotes) and produces a sequence of +/// escaped characters or errors. +/// +/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, +/// the callback will be called exactly once. pub fn unescape_literal(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), From f883762970b4831e61b7b3355981003868946897 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Dec 2023 09:16:18 +1100 Subject: [PATCH 5/9] Remove explicit `\n` and `\t` handling in `unescape_str_common`. The fallback `_` case works for these chars, no need to treat them specially. 
--- compiler/rustc_lexer/src/unescape.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 5559c03edea..545c579cdb5 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -351,8 +351,6 @@ where _ => scan_escape::(&mut chars, mode), } } - '\n' => Ok(b'\n'.into()), - '\t' => Ok(b'\t'.into()), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into), From 119b1d0c63760eb309da5176f31a58f9854d6815 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Dec 2023 09:25:53 +1100 Subject: [PATCH 6/9] Eliminate `is_byte: bool` args in unescaping code. These don't really make sense since C string literals were added. This commit removes them in favour of `mode: Mode` args. `ascii_check` still has a `characters_should_be_ascii: bool` arg. Also, `characters_should_be_ascii` is renamed to be shorter. 
--- compiler/rustc_lexer/src/unescape.rs | 40 +++++++++++++--------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 545c579cdb5..8c5b2e3635e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -87,14 +87,12 @@ where match mode { Mode::Char | Mode::Byte => { let mut chars = src.chars(); - let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); + let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); } Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback), - Mode::RawStr | Mode::RawByteStr => { - unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) - } + Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), Mode::CStr | Mode::RawCStr => unreachable!(), } } @@ -122,11 +120,9 @@ where F: FnMut(Range, Result), { if mode == Mode::RawCStr { - unescape_raw_str_or_raw_byte_str( - src, - mode.characters_should_be_ascii(), - &mut |r, result| callback(r, result.map(CStrUnit::Char)), - ); + unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { + callback(r, result.map(CStrUnit::Char)) + }); } else { unescape_str_common(src, mode, callback); } @@ -135,13 +131,13 @@ where /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error. pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), false) + unescape_char_or_byte(&mut src.chars(), Mode::Char) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), true).map(byte_from_char) + unescape_char_or_byte(&mut src.chars(), Mode::Byte).map(byte_from_char) } /// What kind of literal do we parse. 
@@ -180,7 +176,8 @@ impl Mode { } /// Whether characters within the literal must be within the ASCII range - fn characters_should_be_ascii(self) -> bool { + #[inline] + fn chars_should_be_ascii(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, @@ -299,22 +296,21 @@ fn scan_unicode( } #[inline] -fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result { - if characters_should_be_ascii && !c.is_ascii() { - // Byte literal can't be a non-ascii character. +fn ascii_check(c: char, chars_should_be_ascii: bool) -> Result { + if chars_should_be_ascii && !c.is_ascii() { Err(EscapeError::NonAsciiCharInByte) } else { Ok(c) } } -fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result { +fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { let c = chars.next().ok_or(EscapeError::ZeroChars)?; let res = match c { - '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }), + '\\' => scan_escape(chars, mode), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, is_byte), + _ => ascii_check(c, mode.chars_should_be_ascii()), }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); @@ -329,6 +325,7 @@ where F: FnMut(Range, Result), { let mut chars = src.chars(); + let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop // The `start` and `end` computation here is complicated because // `skip_ascii_whitespace` makes us to skip over chars without counting @@ -353,7 +350,7 @@ where } '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into), + _ => ascii_check(c, chars_should_be_ascii).map(Into::into), }; let end = src.len() - chars.as_str().len(); callback(start..end, res.map(Into::into)); @@ 
-390,11 +387,12 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(src: &str, is_byte: bool, callback: &mut F) +fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { let mut chars = src.chars(); + let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop // The `start` and `end` computation here matches the one in // `unescape_str_common` for consistency, even though this function @@ -403,7 +401,7 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, is_byte), + _ => ascii_check(c, chars_should_be_ascii), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); From 9741dba7fa3fd61a8851e906861ac88bc527ca4c Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Dec 2023 10:49:42 +1100 Subject: [PATCH 7/9] Tweak the `no-nuls.rs` test. The `empty!` macro calls should be outside the `cfg(FALSE)` function. --- .../rfcs/rfc-3348-c-string-literals/no-nuls.rs | Bin 594 -> 738 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs index a7e36b2233ec5478ddbc6d2f689c40d535c209ef..e20ca50b88f905d5a0332a71f64d02c061f21711 100644 GIT binary patch delta 160 zcmcb_@`!cAR7M+peT9&WRE6ZUbcN*nyyDW_)MAAapg>M$NorAIPO(B#YFd6#szP3A zjzV%qYI1gFUb;ela&l==vEJk Date: Thu, 7 Dec 2023 18:52:11 +1100 Subject: [PATCH 8/9] Remove an unnecessary `into`. 
--- compiler/rustc_lexer/src/unescape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 8c5b2e3635e..72cb78f82a2 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -353,7 +353,7 @@ where _ => ascii_check(c, chars_should_be_ascii).map(Into::into), }; let end = src.len() - chars.as_str().len(); - callback(start..end, res.map(Into::into)); + callback(start..end, res); } } From 0a401b624bff7cfa776bac2f767b21ffdb1bd98f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Dec 2023 11:18:09 +1100 Subject: [PATCH 9/9] Tweak `Mode`. - Add `use Mode::*` to avoid all the qualifiers. - Reorder the variants. The existing order makes no particular sense, which has bugged me for some time. I've chosen an order that makes sense to me. --- compiler/rustc_lexer/src/unescape.rs | 72 +++++++++++++++------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 72cb78f82a2..249126a269e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -4,6 +4,8 @@ use std::ops::Range; use std::str::Chars; +use Mode::*; + #[cfg(test)] mod tests; @@ -85,15 +87,14 @@ where F: FnMut(Range, Result), { match mode { - Mode::Char | Mode::Byte => { + Char | Byte => { let mut chars = src.chars(); let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); } - Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback), - - Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), - Mode::CStr | Mode::RawCStr => unreachable!(), + Str | ByteStr => unescape_str_common(src, mode, callback), + RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), + CStr | RawCStr => unreachable!(), } } @@ -119,36 +120,44 @@ pub fn 
unescape_c_string(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { - if mode == Mode::RawCStr { - unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { - callback(r, result.map(CStrUnit::Char)) - }); - } else { - unescape_str_common(src, mode, callback); + match mode { + CStr => { + unescape_str_common(src, mode, callback); + } + RawCStr => { + unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { + callback(r, result.map(CStrUnit::Char)) + }); + } + Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), } } /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error. pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Mode::Char) + unescape_char_or_byte(&mut src.chars(), Char) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Mode::Byte).map(byte_from_char) + unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) } /// What kind of literal do we parse. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { Char, - Str, + Byte, - ByteStr, + + Str, RawStr, + + ByteStr, RawByteStr, + CStr, RawCStr, } @@ -156,47 +165,42 @@ pub enum Mode { impl Mode { pub fn in_double_quotes(self) -> bool { match self { - Mode::Str - | Mode::ByteStr - | Mode::RawStr - | Mode::RawByteStr - | Mode::CStr - | Mode::RawCStr => true, - Mode::Char | Mode::Byte => false, + Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, + Char | Byte => false, } } /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. 
fn ascii_escapes_should_be_ascii(self) -> bool { match self { - Mode::Char | Mode::Str => true, - Mode::Byte | Mode::ByteStr | Mode::CStr => false, - Mode::RawStr | Mode::RawByteStr | Mode::RawCStr => unreachable!(), + Char | Str => true, + Byte | ByteStr | CStr => false, + RawStr | RawByteStr | RawCStr => unreachable!(), } } - /// Whether characters within the literal must be within the ASCII range + /// Whether characters within the literal must be within the ASCII range. #[inline] fn chars_should_be_ascii(self) -> bool { match self { - Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, - Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + Byte | ByteStr | RawByteStr => true, + Char | Str | RawStr | CStr | RawCStr => false, } } /// Byte literals do not allow unicode escape. fn is_unicode_escape_disallowed(self) -> bool { match self { - Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, - Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + Byte | ByteStr | RawByteStr => true, + Char | Str | RawStr | CStr | RawCStr => false, } } pub fn prefix_noraw(self) -> &'static str { match self { - Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b", - Mode::CStr | Mode::RawCStr => "c", - Mode::Char | Mode::Str | Mode::RawStr => "", + Char | Str | RawStr => "", + Byte | ByteStr | RawByteStr => "b", + CStr | RawCStr => "c", } } } @@ -411,7 +415,7 @@ where #[inline] pub fn byte_from_char(c: char) -> u8 { let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); + debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); res as u8 }