From 4bcd646abb83d04f90522b89c734ee310a683507 Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Mon, 7 Apr 2025 01:10:59 +0200 Subject: [PATCH 1/2] Make `core::unicode::printable` more readable. --- library/core/src/unicode/printable.py | 48 +++++++-------- library/core/src/unicode/printable.rs | 84 +++++++++------------------ 2 files changed, 49 insertions(+), 83 deletions(-) diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index 260fa9f9e6a..b21ad42067f 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -178,8 +178,8 @@ def main(): else: normal0.append((a, b - a)) - singletons0u, singletons0l = compress_singletons(singletons0) - singletons1u, singletons1l = compress_singletons(singletons1) + SINGLETONS0_UPPER, SINGLETONS0_LOWER = compress_singletons(singletons0) + SINGLETONS1_UPPER, SINGLETONS1_LOWER = compress_singletons(singletons1) normal0 = compress_normal(normal0) normal1 = compress_normal(normal1) @@ -187,21 +187,21 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! 
-fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { +fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { + let x_upper = (x >> 8) as u8; + let mut lower_start = 0; + for &(upper, lower_count) in singletons_upper { + let lower_end = lower_start + lower_count as usize; + if x_upper == upper { + for &lower in &singletons_lower[lower_start..lower_end] { if lower == x as u8 { return false; } } - } else if xupper < upper { + } else if x_upper < upper { break; } - lowerstart = lowerend; + lower_start = lower_end; } let mut x = x as i32; @@ -226,30 +226,22 @@ pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else {\ + match x { + ..32 => false, // ASCII fast path + ..127 => true, // ASCII fast path + ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), + ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1),\ """) for a, b in extra: - print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) - print(" return false;") - print(" }") + print(" 0x{:x}..0x{:x} => false,".format(a, a + b)) print("""\ - true + _ => true, } }\ """) print() - print_singletons(singletons0u, singletons0l, "SINGLETONS0U", "SINGLETONS0L") - print_singletons(singletons1u, singletons1l, "SINGLETONS1U", "SINGLETONS1L") + print_singletons(SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER") + print_singletons(SINGLETONS1_UPPER, 
SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER") print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index d8fb50e4ed2..be0295a13c2 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,21 +1,21 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { +fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { + let x_upper = (x >> 8) as u8; + let mut lower_start = 0; + for &(upper, lower_count) in singletons_upper { + let lower_end = lower_start + lower_count as usize; + if x_upper == upper { + for &lower in &singletons_lower[lower_start..lower_end] { if lower == x as u8 { return false; } } - } else if xupper < upper { + } else if x_upper < upper { break; } - lowerstart = lowerend; + lower_start = lower_end; } let mut x = x as i32; @@ -40,53 +40,27 @@ pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else { - if 0x2a6e0 <= x && x < 0x2a700 { - return false; - } - if 0x2b73a <= x && x < 0x2b740 { - return false; - } - if 0x2b81e <= x && x < 0x2b820 { - return false; - } - if 0x2cea2 <= x && x < 0x2ceb0 { - return false; - } - if 0x2ebe1 <= x && x < 0x2ebf0 { - return false; - } - if 0x2ee5e <= x && x 
< 0x2f800 { - return false; - } - if 0x2fa1e <= x && x < 0x30000 { - return false; - } - if 0x3134b <= x && x < 0x31350 { - return false; - } - if 0x323b0 <= x && x < 0xe0100 { - return false; - } - if 0xe01f0 <= x && x < 0x110000 { - return false; - } - true + match x { + ..32 => false, // ASCII fast path + ..127 => true, // ASCII fast path + ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), + ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1), + 0x2a6e0..0x2a700 => false, + 0x2b73a..0x2b740 => false, + 0x2b81e..0x2b820 => false, + 0x2cea2..0x2ceb0 => false, + 0x2ebe1..0x2ebf0 => false, + 0x2ee5e..0x2f800 => false, + 0x2fa1e..0x30000 => false, + 0x3134b..0x31350 => false, + 0x323b0..0xe0100 => false, + 0xe01f0..0x110000 => false, + _ => true, } } #[rustfmt::skip] -const SINGLETONS0U: &[(u8, u8)] = &[ +const SINGLETONS0_UPPER: &[(u8, u8)] = &[ (0x00, 1), (0x03, 5), (0x05, 6), @@ -129,7 +103,7 @@ const SINGLETONS0U: &[(u8, u8)] = &[ (0xff, 9), ]; #[rustfmt::skip] -const SINGLETONS0L: &[u8] = &[ +const SINGLETONS0_LOWER: &[u8] = &[ 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90, 0x1c, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, 0x5c, @@ -169,7 +143,7 @@ const SINGLETONS0L: &[u8] = &[ 0xfe, 0xff, ]; #[rustfmt::skip] -const SINGLETONS1U: &[(u8, u8)] = &[ +const SINGLETONS1_UPPER: &[(u8, u8)] = &[ (0x00, 6), (0x01, 1), (0x03, 1), @@ -216,7 +190,7 @@ const SINGLETONS1U: &[(u8, u8)] = &[ (0xfb, 1), ]; #[rustfmt::skip] -const SINGLETONS1L: &[u8] = &[ +const SINGLETONS1_LOWER: &[u8] = &[ 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x7b, 0x8b, 0x93, 0x96, 0xa2, 0xb2, 0xba, 0x86, 0xb1, 0x06, 0x07, 0x09, 0x36, 0x3d, From 9109550e4ce1b920f97689ef9e3758397fd0cd3b Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Mon, 7 Apr 2025 01:16:30 +0200 Subject: [PATCH 2/2] Optimize `core::unicode::printable`. 
--- library/core/src/unicode/printable.py | 75 +++++++++++++++++++++++---- library/core/src/unicode/printable.rs | 67 ++++++++++++++++++++---- 2 files changed, 122 insertions(+), 20 deletions(-) diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index b21ad42067f..8ea82ded9b3 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -187,14 +187,28 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { - let x_upper = (x >> 8) as u8; +/// # Safety +/// +/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be +/// equal to the length of `singletons_lower`. +/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big +/// endian, with the highest bit set and the length contained in the remaining 15 bits. +unsafe fn check( + x: u16, + singletons_upper: &[(u8, u8)], + singletons_lower: &[u8], + normal: &[u8], +) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); let mut lower_start = 0; for &(upper, lower_count) in singletons_upper { let lower_end = lower_start + lower_count as usize; - if x_upper == upper { - for &lower in &singletons_lower[lower_start..lower_end] { - if lower == x as u8 { + if upper == x_upper { + // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` + // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be + // at most `singletons_lower.len()`. 
+ for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { + if lower == x_lower { return false; } } @@ -209,9 +223,14 @@ fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: let mut current = true; while let Some(v) = normal.next() { let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 + let upper = v & 0x7f; + // SAFETY: The encoding of `normal` is guaranteed by the caller such that + // if the length is greater than 0x7f, it consists of two bytes, so there + // must be a next byte. + let lower = unsafe { normal.next().unwrap_unchecked() }; + i32::from(u16::from_be_bytes([upper, lower])) } else { - v as i32 + i32::from(v) }; x -= len; if x < 0 { @@ -229,8 +248,38 @@ pub(crate) fn is_printable(x: char) -> bool { match x { ..32 => false, // ASCII fast path ..127 => true, // ASCII fast path - ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), - ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1),\ + ..0x10000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS0_UPPER.len() { + lower_count_total += SINGLETONS0_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS0_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal + // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. 
+ unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } + } + ..0x20000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS1_UPPER.len() { + lower_count_total += SINGLETONS1_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS1_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal + // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. + unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } + }\ """) for a, b in extra: print(" 0x{:x}..0x{:x} => false,".format(a, a + b)) @@ -240,8 +289,12 @@ pub(crate) fn is_printable(x: char) -> bool { }\ """) print() - print_singletons(SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER") - print_singletons(SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER") + print_singletons( + SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER" + ) + print_singletons( + SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER" + ) print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index be0295a13c2..8cd891670cf 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,14 +1,28 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { - let x_upper = (x >> 8) as u8; +/// # Safety +/// +/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be +/// equal to the length of `singletons_lower`. 
+/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big +/// endian, with the highest bit set and the length contained in the remaining 15 bits. +unsafe fn check( + x: u16, + singletons_upper: &[(u8, u8)], + singletons_lower: &[u8], + normal: &[u8], +) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); let mut lower_start = 0; for &(upper, lower_count) in singletons_upper { let lower_end = lower_start + lower_count as usize; - if x_upper == upper { - for &lower in &singletons_lower[lower_start..lower_end] { - if lower == x as u8 { + if upper == x_upper { + // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` + // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be + // at most `singletons_lower.len()`. + for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { + if lower == x_lower { return false; } } @@ -23,9 +37,14 @@ fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: let mut current = true; while let Some(v) = normal.next() { let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 + let upper = v & 0x7f; + // SAFETY: The encoding of `normal` is guaranteed by the caller such that + // if the length is greater than 0x7f, it consists of two bytes, so there + // must be a next byte. 
+ let lower = unsafe { normal.next().unwrap_unchecked() }; + i32::from(u16::from_be_bytes([upper, lower])) } else { - v as i32 + i32::from(v) }; x -= len; if x < 0 { @@ -43,8 +62,38 @@ pub(crate) fn is_printable(x: char) -> bool { match x { ..32 => false, // ASCII fast path ..127 => true, // ASCII fast path - ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), - ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1), + ..0x10000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS0_UPPER.len() { + lower_count_total += SINGLETONS0_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS0_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal + // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. + unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } + } + ..0x20000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS1_UPPER.len() { + lower_count_total += SINGLETONS1_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS1_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal + // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. + unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } + } 0x2a6e0..0x2a700 => false, 0x2b73a..0x2b740 => false, 0x2b81e..0x2b820 => false,