Split Unicode case LUTs into single and multi variants

The majority of char case replacements are single-char replacements, so storing them as `[char; 3]` wastes a lot of space. This commit splits the replacement tables for both `to_lower` and `to_upper` into two separate tables, one with single-character mappings and one with multi-character mappings. This reduces the binary size of programs using all of these tables by roughly 24K bytes.
2024-10-31 14:31:55 +00:00 · 2023-03-16 11:56:33 +01:00 · 2023-03-16 11:56:33 +01:00 · f9bd884385
commit f9bd884385
parent 8a4eb9e3a8
2 changed files with 1008 additions and 1695 deletions
--- a/library/core/src/unicode/unicode_data.rs
+++ b/library/core/src/unicode/unicode_data.rs
--- a/src/tools/unicode-table-generator/src/case_mapping.rs
+++ b/src/tools/unicode-table-generator/src/case_mapping.rs
@ -1,22 +1,47 @@
 use crate::{fmt_list, UnicodeData};
-use std::{collections::BTreeMap, fmt};
+use std::{
+    collections::BTreeMap,
+    fmt::{self, Write},
+};

 pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
    let mut file = String::new();

    file.push_str(HEADER.trim_start());
-    file.push_str(&generate_table("LOWER", &data.to_lower));
+    file.push_str(&generate_tables("LOWER", &data.to_lower));
    file.push_str("\n\n");
-    file.push_str(&generate_table("UPPER", &data.to_upper));
+    file.push_str(&generate_tables("UPPER", &data.to_upper));
    file
 }

-fn generate_table(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
-    format!(
-        "static {}CASE_TABLE: &[(char, [char; 3])] = &[{}];",
+fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
+    let (single, multi): (Vec<_>, Vec<_>) = data
+        .iter()
+        .map(to_mapping)
+        .filter(|(k, _)| !k.0.is_ascii())
+        .partition(|(_, [_, s, t])| s.0 == '\0' && t.0 == '\0');
+
+    let mut tables = String::new();
+
+    write!(
+        tables,
+        "static {}CASE_TABLE_SINGLE: &[(char, char)] = &[{}];",
        case,
-        fmt_list(data.iter().map(to_mapping).filter(|(k, _)| !k.0.is_ascii()))
+        fmt_list(single.into_iter().map(|(k, [v, _, _])| (k, v)))
    )
+    .unwrap();
+
+    tables.push_str("\n\n");
+
+    write!(
+        tables,
+        "static {}CASE_TABLE_MULTI: &[(char, [char; 3])] = &[{}];",
+        case,
+        fmt_list(multi)
+    )
+    .unwrap();
+
+    tables
 }

 fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
@ -43,9 +68,9 @@ pub fn to_lower(c: char) -> [char; 3] {
    if c.is_ascii() {
        [(c as u8).to_ascii_lowercase() as char, '\0', '\0']
    } else {
-        match bsearch_case_table(c, LOWERCASE_TABLE) {
+        match bsearch_case_tables(c, LOWERCASE_TABLE_SINGLE, LOWERCASE_TABLE_MULTI) {
+            Some(replacement) => replacement,
            None => [c, '\0', '\0'],
-            Some(index) => LOWERCASE_TABLE[index].1,
        }
    }
 }
@ -54,14 +79,21 @@ pub fn to_upper(c: char) -> [char; 3] {
    if c.is_ascii() {
        [(c as u8).to_ascii_uppercase() as char, '\0', '\0']
    } else {
-        match bsearch_case_table(c, UPPERCASE_TABLE) {
+        match bsearch_case_tables(c, UPPERCASE_TABLE_SINGLE, UPPERCASE_TABLE_MULTI) {
+            Some(replacement) => replacement,
            None => [c, '\0', '\0'],
-            Some(index) => UPPERCASE_TABLE[index].1,
        }
    }
 }

-fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
-    table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
+fn bsearch_case_tables(
+    c: char,
+    single: &[(char, char)],
+    multi: &[(char, [char; 3])],
+) -> Option<[char; 3]> {
+    match single.binary_search_by(|&(key, _)| key.cmp(&c)) {
+        Ok(i) => Some([single[i].1, '\0', '\0']),
+        Err(_) => multi.binary_search_by(|&(key, _)| key.cmp(&c)).map(|i| multi[i].1).ok(),
+    }
 }
 ";