Auto merge of #128200 - estebank:normalize-whitespace, r=pnkfelix

Change output normalization logic to be linear against size of output. Modify the rendered output normalization routine to scan each character *once* and construct a `String` to be printed out to the terminal *once*, instead of using `String::replace` in a loop multiple times. The output doesn't change, but the time spent to prepare a diagnostic is now faster (or rather, closer to what it was before #127528).
2025-01-27 07:03:45 +00:00 · 2024-08-06 03:44:38 +00:00 · 2024-08-06 03:44:38 +00:00 · 8c7e0e1608
commit 8c7e0e1608
parent c9687a95a6 51b5bb1798
1 changed files with 38 additions and 30 deletions
--- a/compiler/rustc_errors/src/emitter.rs
+++ b/compiler/rustc_errors/src/emitter.rs
@ -2564,22 +2564,13 @@ fn num_decimal_digits(num: usize) -> usize {

 // We replace some characters so the CLI output is always consistent and underlines aligned.
 // Keep the following list in sync with `rustc_span::char_width`.
+// ATTENTION: keep lexicographically sorted (by `char`) so that the binary search will work
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),    // We do our own tab replacement
-    ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', "<EFBFBD>"), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', "<EFBFBD>"), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', "<EFBFBD>"), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', "<EFBFBD>"),
-    ('\u{2066}', "<EFBFBD>"),
-    ('\u{2067}', "<EFBFBD>"),
-    ('\u{2068}', "<EFBFBD>"),
-    ('\u{202C}', "<EFBFBD>"),
-    ('\u{2069}', "<EFBFBD>"),
+    // tidy-alphabetical-start
    // In terminals without Unicode support the following will be garbled, but in *all* terminals
    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
    // support" gate.
-    ('\u{0000}', "␀"),
+    ('\0', "␀"),
    ('\u{0001}', "␁"),
    ('\u{0002}', "␂"),
    ('\u{0003}', "␃"),
@ -2588,11 +2579,12 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
    ('\u{0006}', "␆"),
    ('\u{0007}', "␇"),
    ('\u{0008}', "␈"),
-    ('\u{000B}', "␋"),
-    ('\u{000C}', "␌"),
-    ('\u{000D}', "␍"),
-    ('\u{000E}', "␎"),
-    ('\u{000F}', "␏"),
+    ('\u{0009}', "    "), // We do our own tab replacement
+    ('\u{000b}', "␋"),
+    ('\u{000c}', "␌"),
+    ('\u{000d}', "␍"),
+    ('\u{000e}', "␎"),
+    ('\u{000f}', "␏"),
    ('\u{0010}', "␐"),
    ('\u{0011}', "␑"),
    ('\u{0012}', "␒"),
@ -2603,21 +2595,37 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
    ('\u{0017}', "␗"),
    ('\u{0018}', "␘"),
    ('\u{0019}', "␙"),
-    ('\u{001A}', "␚"),
-    ('\u{001B}', "␛"),
-    ('\u{001C}', "␜"),
-    ('\u{001D}', "␝"),
-    ('\u{001E}', "␞"),
-    ('\u{001F}', "␟"),
-    ('\u{007F}', "␡"),
+    ('\u{001a}', "␚"),
+    ('\u{001b}', "␛"),
+    ('\u{001c}', "␜"),
+    ('\u{001d}', "␝"),
+    ('\u{001e}', "␞"),
+    ('\u{001f}', "␟"),
+    ('\u{007f}', "␡"),
+    ('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
+    ('\u{202a}', "<EFBFBD>"), // The following unicode text flow control characters are inconsistently
+    ('\u{202b}', "<EFBFBD>"), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202c}', "<EFBFBD>"), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202d}', "<EFBFBD>"),
+    ('\u{202e}', "<EFBFBD>"),
+    ('\u{2066}', "<EFBFBD>"),
+    ('\u{2067}', "<EFBFBD>"),
+    ('\u{2068}', "<EFBFBD>"),
+    ('\u{2069}', "<EFBFBD>"),
+    // tidy-alphabetical-end
 ];

-fn normalize_whitespace(str: &str) -> String {
-    let mut s = str.to_string();
-    for (c, replacement) in OUTPUT_REPLACEMENTS {
-        s = s.replace(*c, replacement);
-    }
-    s
+fn normalize_whitespace(s: &str) -> String {
+    // Walk the input once. A char found in the sorted `OUTPUT_REPLACEMENTS` table is replaced by
+    // its replacement string (which may be empty or longer than one char); any other char is kept
+    // as is. The output is built in one `String`, pre-sized to the input's byte length.
+    s.chars().fold(String::with_capacity(s.len()), |mut s, c| {
+        match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
+            Ok(i) => s.push_str(OUTPUT_REPLACEMENTS[i].1),
+            _ => s.push(c),
+        }
+        s
+    })
 }

 fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {