Auto merge of #113476 - fee1-dead-contrib:c-str-lit, r=petrochenkov

Reimplement C-str literals. This reverts #113334, cc `@fmease`. While converting lexer tokens to AST tokens in `rustc_parse`, we check the edition of the token's span. If the edition is earlier than 2021, we split the token into two: the first being the identifier (the `c` or `cr` prefix) and the second being the string literal.
2024-11-25 16:24:46 +00:00 · 2023-07-25 12:04:34 +00:00 · 2023-07-25 12:04:34 +00:00 · 23405bb123
commit 23405bb123
parent ff8fe76c0e a0376e9ec2
12 changed files with 85 additions and 97 deletions
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@ -24,6 +24,10 @@ impl<'a> Cursor<'a> {
        }
    }

+    pub fn as_str(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
    /// Returns the last eaten symbol (or `'\0'` in release builds).
    /// (For debug assertions only.)
    pub(crate) fn prev(&self) -> char {
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@ -367,6 +367,13 @@ impl Cursor<'_> {
                Some(|terminated| Byte { terminated }),
            ),

+            // c-string literal, raw c-string literal or identifier.
+            'c' => self.c_or_byte_string(
+                |terminated| CStr { terminated },
+                |n_hashes| RawCStr { n_hashes },
+                None,
+            ),
+
            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -9,8 +9,8 @@ use rustc_ast::tokenstream::TokenStream;
 use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
 use rustc_lexer::unescape::{self, EscapeError, Mode};
-use rustc_lexer::Cursor;
 use rustc_lexer::{Base, DocStyle, RawStrError};
+use rustc_lexer::{Cursor, LiteralKind};
 use rustc_session::lint::builtin::{
    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
 };
@ -118,6 +118,7 @@ impl<'a> StringReader<'a> {
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
        loop {
+            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);
@ -165,10 +166,7 @@ impl<'a> StringReader<'a> {
                    continue;
                }
                rustc_lexer::TokenKind::Ident => {
-                    let sym = nfc_normalize(self.str_from(start));
-                    let span = self.mk_sp(start, self.pos);
-                    self.sess.symbol_gallery.insert(sym, span);
-                    token::Ident(sym, false)
+                    self.ident(start)
                }
                rustc_lexer::TokenKind::RawIdent => {
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
@ -182,10 +180,7 @@ impl<'a> StringReader<'a> {
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    self.report_unknown_prefix(start);
-                    let sym = nfc_normalize(self.str_from(start));
-                    let span = self.mk_sp(start, self.pos);
-                    self.sess.symbol_gallery.insert(sym, span);
-                    token::Ident(sym, false)
+                    self.ident(start)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
@ -203,6 +198,27 @@ impl<'a> StringReader<'a> {
                        .push(span);
                    token::Ident(sym, false)
                }
+                // split up (raw) c string literals to an ident and a string literal when edition < 2021.
+                rustc_lexer::TokenKind::Literal {
+                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
+                    suffix_start: _,
+                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
+                    let prefix_len = match kind {
+                        LiteralKind::CStr { .. } => 1,
+                        LiteralKind::RawCStr { .. } => 2,
+                        _ => unreachable!(),
+                    };
+
+                    // reset the state so that only the prefix ("c" or "cr")
+                    // was consumed.
+                    let lit_start = start + BytePos(prefix_len);
+                    self.pos = lit_start;
+                    self.cursor = Cursor::new(&str_before[prefix_len as usize..]);
+
+                    self.report_unknown_prefix(start);
+                    let prefix_span = self.mk_sp(start, lit_start);
+                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
+                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@ -317,6 +333,13 @@ impl<'a> StringReader<'a> {
        }
    }

+    fn ident(&self, start: BytePos) -> TokenKind {
+        let sym = nfc_normalize(self.str_from(start));
+        let span = self.mk_sp(start, self.pos);
+        self.sess.symbol_gallery.insert(sym, span);
+        token::Ident(sym, false)
+    }
+
    fn struct_fatal_span_char(
        &self,
        from_pos: BytePos,
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs
@ -0,0 +1,14 @@
+// force-host
+// edition: 2018
+// no-prefer-dynamic
+#![crate_type = "proc-macro"]
+
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use std::str::FromStr;
+
+#[proc_macro]
+pub fn number_of_tokens(_: TokenStream) -> TokenStream {
+    TokenStream::from_str("c\"\"").unwrap().into_iter().count().to_string().parse().unwrap()
+}
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs
@ -1,5 +1,4 @@
-// FIXME(c_str_literals): This should be `run-pass`
-// known-bug: #113333
+// run-pass
 // edition: 2021

 #![feature(c_str_literals)]
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr
@ -1,25 +0,0 @@
-error: prefix `c` is unknown
-  --> $DIR/basic.rs:8:27
-   |
-LL |     assert_eq!(b"test\0", c"test".to_bytes_with_nul());
-   |                           ^ unknown prefix
-   |
-   = note: prefixed identifiers and literals are reserved since Rust 2021
-help: consider inserting whitespace here
-   |
-LL |     assert_eq!(b"test\0", c "test".to_bytes_with_nul());
-   |                            +
-
-error: no rules expected the token `"test"`
-  --> $DIR/basic.rs:8:28
-   |
-LL |     assert_eq!(b"test\0", c"test".to_bytes_with_nul());
-   |                            -^^^^^
-   |                            |
-   |                            no rules expected this token in macro call
-   |                            help: missing comma here
-   |
-   = note: while trying to match sequence start
-
-error: aborting due to 2 previous errors
-
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs
@ -0,0 +1,16 @@
+// even if this crate is edition 2021, proc macros compiled using older
+// editions should still be able to observe the pre-2021 token behavior
+//
+// adapted from tests/ui/rust-2021/reserved-prefixes-via-macro.rs
+
+// edition: 2021
+// check-pass
+
+// aux-build: count.rs
+extern crate count;
+
+const _: () = {
+    assert!(count::number_of_tokens!() == 2);
+};
+
+fn main() {}
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr
@ -1,32 +1,21 @@
-error: prefix `c` is unknown
+error[E0658]: `c".."` literals are experimental
  --> $DIR/gate.rs:10:5
   |
 LL |     c"foo";
-   |     ^ unknown prefix
+   |     ^^^^^^
   |
-   = note: prefixed identifiers and literals are reserved since Rust 2021
-help: consider inserting whitespace here
-   |
-LL |     c "foo";
-   |      +
+   = note: see issue #105723 <https://github.com/rust-lang/rust/issues/105723> for more information
+   = help: add `#![feature(c_str_literals)]` to the crate attributes to enable

-error: prefix `c` is unknown
+error[E0658]: `c".."` literals are experimental
  --> $DIR/gate.rs:13:8
   |
 LL |     m!(c"test");
-   |        ^ unknown prefix
+   |        ^^^^^^^
   |
-   = note: prefixed identifiers and literals are reserved since Rust 2021
-help: consider inserting whitespace here
-   |
-LL |     m!(c "test");
-   |         +
+   = note: see issue #105723 <https://github.com/rust-lang/rust/issues/105723> for more information
+   = help: add `#![feature(c_str_literals)]` to the crate attributes to enable

-error: expected one of `!`, `.`, `::`, `;`, `?`, `{`, `}`, or an operator, found `"foo"`
-  --> $DIR/gate.rs:10:6
-   |
-LL |     c"foo";
-   |      ^^^^^ expected one of 8 possible tokens
-
-error: aborting due to 3 previous errors
+error: aborting due to 2 previous errors

+For more information about this error, try `rustc --explain E0658`.
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs
@ -1,5 +1,4 @@
-// FIXME(c_str_literals): This should be `run-pass`
-// known-bug: #113333
+// run-pass
 // edition: 2021

 #![feature(c_str_literals)]
--- a/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.stderr
+++ b/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.stderr
@ -1,38 +0,0 @@
-error: prefix `c` is unknown
-  --> $DIR/non-ascii.rs:9:9
-   |
-LL |         c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(),
-   |         ^ unknown prefix
-   |
-   = note: prefixed identifiers and literals are reserved since Rust 2021
-help: consider inserting whitespace here
-   |
-LL |         c "\xEF\x80🦀\u{1F980}".to_bytes_with_nul(),
-   |          +
-
-error: out of range hex escape
-  --> $DIR/non-ascii.rs:9:11
-   |
-LL |         c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(),
-   |           ^^^^ must be a character in the range [\x00-\x7f]
-
-error: out of range hex escape
-  --> $DIR/non-ascii.rs:9:15
-   |
-LL |         c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(),
-   |               ^^^^ must be a character in the range [\x00-\x7f]
-
-error: no rules expected the token `"\xEF\x80🦀\u{1F980}"`
-  --> $DIR/non-ascii.rs:9:10
-   |
-LL |         c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(),
-   |          -^^^^^^^^^^^^^^^^^^^^
-   |          |
-   |          no rules expected this token in macro call
-   |          help: missing comma here
-   |
-note: while trying to match `,`
-  --> $SRC_DIR/core/src/macros/mod.rs:LL:COL
-
-error: aborting due to 4 previous errors
-