Add skip list based implementation for smaller encoding

This arranges for the sparser sets (everything except Lowercase and Uppercase) to be encoded in significantly less space. However, it is also a performance trade-off: the skip list is roughly 3x slower than the bitset encoding. The roughly 40% size reduction is deemed sufficiently important to merit this performance loss, particularly as it is unlikely that this code is hot anywhere (and if it is, paying the memory cost for a bitset that directly represents the data seems worthwhile).

Alphabetic     : 1599 bytes (- 937 bytes)
Case_Ignorable :  949 bytes (- 822 bytes)
Cased          :  359 bytes (- 429 bytes)
Cc             :    9 bytes (-  15 bytes)
Grapheme_Extend:  813 bytes (- 675 bytes)
Lowercase      :  863 bytes
N              :  419 bytes (- 619 bytes)
Uppercase      :  776 bytes
White_Space    :   37 bytes (-  46 bytes)
Total table sizes: 5824 bytes (-3543 bytes)
2024-12-01 19:23:50 +00:00 · 2020-03-25 21:00:01 -04:00 · 2020-03-25 21:00:01 -04:00 · 9c1ceece20
commit 9c1ceece20
parent 33b9e6f5cf
5 changed files with 466 additions and 819 deletions
--- a/src/libcore/unicode/unicode_data.rs
+++ b/src/libcore/unicode/unicode_data.rs
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@ -4,6 +4,7 @@ use ucd_parse::Codepoints;

 mod case_mapping;
 mod raw_emitter;
+mod skiplist;
 mod unicode_download;

 use raw_emitter::{emit_codepoints, RawEmitter};
@ -172,13 +173,14 @@ fn main() {

        modules.push((property.to_lowercase().to_string(), emitter.file));
        println!(
-            "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})",
+            "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
            property,
            emitter.bytes_used,
            datapoints,
            ranges.len(),
            ranges.first().unwrap().start,
-            ranges.last().unwrap().end
+            ranges.last().unwrap().end,
+            emitter.desc,
        );
        total_bytes += emitter.bytes_used;
    }
@ -259,6 +261,7 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
    let mut s = String::new();
    s.push_str("#![allow(incomplete_features, unused)]\n");
    s.push_str("#![feature(const_generics)]\n\n");
+    s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
    s.push_str(&format!("#[path = \"{}\"]\n", data_path));
    s.push_str("mod unicode_data;\n\n");

@ -267,7 +270,8 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
    for (property, ranges) in ranges {
        s.push_str(&format!(r#"    println!("Testing {}");"#, property));
        s.push('\n');
-        s.push_str(&format!("    {}();\n", property.to_lowercase()));
+        s.push_str(&format!("    {}_true();\n", property.to_lowercase()));
+        s.push_str(&format!("    {}_false();\n", property.to_lowercase()));
        let mut is_true = Vec::new();
        let mut is_false = Vec::new();
        for ch_num in 0..(std::char::MAX as u32) {
@ -281,8 +285,10 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
            }
        }

-        s.push_str(&format!("    fn {}() {{\n", property.to_lowercase()));
+        s.push_str(&format!("    fn {}_true() {{\n", property.to_lowercase()));
        generate_asserts(&mut s, property, &is_true, true);
+        s.push_str("    }\n\n");
+        s.push_str(&format!("    fn {}_false() {{\n", property.to_lowercase()));
        generate_asserts(&mut s, property, &is_false, false);
        s.push_str("    }\n\n");
    }
@ -295,19 +301,19 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool
    for range in ranges_from_set(points) {
        if range.end == range.start + 1 {
            s.push_str(&format!(
-                "        assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n",
+                "        assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
                if truthy { "" } else { "!" },
                property.to_lowercase(),
-                range.start,
                std::char::from_u32(range.start).unwrap(),
-        ));
+                range.start,
+            ));
        } else {
            s.push_str(&format!("        for chn in {:?}u32 {{\n", range));
            s.push_str(&format!(
                "            assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
                if truthy { "" } else { "!" },
                property.to_lowercase(),
-        ));
+            ));
            s.push_str("        }\n");
        }
    }
@ -323,17 +329,25 @@ fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    loop {
        let mut new_ranges = Vec::new();
        let mut idx_iter = 0..(ranges.len() - 1);
+        let mut should_insert_last = true;
        while let Some(idx) = idx_iter.next() {
            let cur = ranges[idx].clone();
            let next = ranges[idx + 1].clone();
            if cur.end == next.start {
-                let _ = idx_iter.next(); // skip next as we're merging it in
+                if idx_iter.next().is_none() {
+                    // We're merging the last element
+                    should_insert_last = false;
+                }
                new_ranges.push(cur.start..next.end);
            } else {
+                // We're *not* merging the last element
+                should_insert_last = true;
                new_ranges.push(cur);
            }
        }
-        new_ranges.push(ranges.last().unwrap().clone());
+        if should_insert_last {
+            new_ranges.push(ranges.last().unwrap().clone());
+        }
        if new_ranges.len() == ranges.len() {
            *ranges = new_ranges;
            break;
@ -341,4 +355,12 @@ fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
            *ranges = new_ranges;
        }
    }
+
+    let mut last_end = None;
+    for range in ranges {
+        if let Some(last) = last_end {
+            assert!(range.start > last, "{:?}", range);
+        }
+        last_end = Some(range.end);
+    }
 }
--- a/src/tools/unicode-table-generator/src/range_search.rs
+++ b/src/tools/unicode-table-generator/src/range_search.rs
@ -1,5 +1,5 @@
 #[inline(always)]
-fn range_search<
+fn bitset_search<
    const N: usize,
    const CHUNK_SIZE: usize,
    const N1: usize,
@ -47,3 +47,52 @@ fn range_search<
    };
    (word & (1 << (needle % 64) as u64)) != 0
 }
+
+fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
+    short_offset_run_header & ((1 << 21) - 1) // low 21 bits of the packed header hold the prefix sum
+}
+
+fn decode_length(short_offset_run_header: u32) -> usize {
+    (short_offset_run_header >> 21) as usize // upper 11 bits; skip_search uses this as a start index into `offsets`
+}
+
+#[inline(always)]
+fn skip_search<const SOR: usize, const OFFSETS: usize>(
+    needle: u32,
+    short_offset_runs: &[u32; SOR],
+    offsets: &[u8; OFFSETS],
+) -> bool {
+    // Find the first run whose prefix sum is greater than `needle`. Shifting
+    // left by 11 drops the index bits, so only the 21-bit prefix sums compare.
+    //
+    // The result *cannot* be past the end of the array: the last element's
+    // prefix sum is greater than std::char::MAX (the largest possible needle),
+    // so neither `Ok(idx) + 1` nor `Err(idx)` can equal the array length.
+    // The accesses below therefore stay in bounds (no bounds check is needed).
+    let last_idx =
+        match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
+            Ok(idx) => idx + 1, // needle equals this run's prefix sum, so it falls in the next run
+            Err(idx) => idx, // first run whose prefix sum is greater than needle
+        };
+
+    let mut offset_idx = decode_length(short_offset_runs[last_idx]); // where the selected run starts in `offsets`
+    let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
+        decode_length(*next) - offset_idx // the next run's start index ends this run
+    } else {
+        offsets.len() - offset_idx // the last run extends to the end of `offsets`
+    };
+    let prev =
+        last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);
+
+    let total = needle - prev; // needle relative to the prefix sum of the preceding run
+    let mut prefix_sum = 0;
+    for _ in 0..(length - 1) {
+        let offset = offsets[offset_idx];
+        prefix_sum += offset as u32;
+        if prefix_sum > total {
+            break;
+        }
+        offset_idx += 1;
+    }
+    offset_idx % 2 == 1 // odd index: stopped after a range start but before its end, i.e. inside a range
+}
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@ -46,12 +46,13 @@ use std::ops::Range;
 #[derive(Clone)]
 pub struct RawEmitter {
    pub file: String,
+    pub desc: String,
    pub bytes_used: usize,
 }

 impl RawEmitter {
    pub fn new() -> RawEmitter {
-        RawEmitter { file: String::new(), bytes_used: 0 }
+        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
@ -61,8 +62,21 @@ impl RawEmitter {
        writeln!(&mut self.file, "").unwrap();
    }

-    fn emit_bitset(&mut self, words: &[u64]) {
-        let mut words = words.to_vec();
+    fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
+        let last_code_point = ranges.last().unwrap().end;
+        // bitset for every bit in the codepoint range
+        //
+        // + 2 to ensure an all zero word to use for padding
+        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
+        for range in ranges {
+            for codepoint in range.clone() {
+                let bucket = codepoint as usize / 64;
+                let bit = codepoint as u64 % 64;
+                buckets[bucket] |= 1 << bit;
+            }
+        }
+
+        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);
@ -118,6 +132,19 @@ impl RawEmitter {
        // We only need it for the words that we removed by applying a shift and
        // flip to them.
        self.bytes_used += 2 * canonicalized.canonicalized_words.len();
+
+        self.blank_line();
+
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
+        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
    }

    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
@ -184,40 +211,24 @@ impl RawEmitter {
        .unwrap();
        self.bytes_used += chunk_length * chunks.len();
    }
-
-    pub fn emit_lookup(&mut self) {
-        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
-        writeln!(&mut self.file, "    super::range_search(",).unwrap();
-        writeln!(&mut self.file, "        c as u32,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
-        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
-        writeln!(&mut self.file, "    )").unwrap();
-        writeln!(&mut self.file, "}}").unwrap();
-    }
 }

 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

-    let last_code_point = ranges.last().unwrap().end;
-    // bitset for every bit in the codepoint range
-    //
-    // + 2 to ensure an all zero word to use for padding
-    let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
-    for range in ranges {
-        for codepoint in range.clone() {
-            let bucket = codepoint as usize / 64;
-            let bit = codepoint as u64 % 64;
-            buckets[bucket] |= 1 << bit;
-        }
-    }
+    let mut bitset = emitter.clone();
+    bitset.emit_bitset(&ranges);

-    emitter.emit_bitset(&buckets);
-    emitter.blank_line();
-    emitter.emit_lookup();
+    let mut skiplist = emitter.clone();
+    skiplist.emit_skiplist(&ranges);
+
+    if bitset.bytes_used <= skiplist.bytes_used {
+        *emitter = bitset;
+        emitter.desc = format!("bitset");
+    } else {
+        *emitter = skiplist;
+        emitter.desc = format!("skiplist");
+    }
 }

 struct Canonicalized {
--- a/src/tools/unicode-table-generator/src/skiplist.rs
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@ -0,0 +1,98 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::convert::TryInto;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+/// Header describing one run of u8 offsets; `pack` turns it into a single u32 for the data set.
+#[derive(Debug, PartialEq)]
+struct ShortOffsetRunHeader {
+    /// Running sum of all offsets seen when this header is created. Only 21 bits are allowed.
+    prefix_sum: u32,
+
+    /// Index into the coded offsets where this run starts. Only 11 bits are allowed here.
+    /// This should be enough -- our largest sets are around ~1400 offsets long.
+    start_idx: u16,
+}
+
+impl ShortOffsetRunHeader {
+    fn pack(&self) -> u32 { // panics if either field does not fit its bit budget
+        assert!(self.start_idx < (1 << 11));
+        assert!(self.prefix_sum < (1 << 21));
+
+        (self.start_idx as u32) << 21 | self.prefix_sum // start_idx in the upper 11 bits, prefix_sum in the lower 21
+    }
+}
+
+impl RawEmitter {
+    pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) { // writes SHORT_OFFSET_RUNS, OFFSETS and a `lookup` fn to `self.file`
+        let mut offsets = Vec::<u32>::new();
+        let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
+        let mut offset = 0;
+        for pt in points {
+            let delta = pt - offset; // gap from the previous range boundary
+            offsets.push(delta);
+            offset = pt;
+        }
+        // Sentinel: no valid char is this large, so the final prefix sum exceeds
+        // every possible needle; it also never fits in a u8, closing the last run.
+        offsets.push(std::char::MAX as u32 + 1);
+        let mut coded_offsets: Vec<u8> = Vec::new();
+        let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![];
+        let mut iter = offsets.iter().cloned();
+        let mut prefix_sum = 0;
+        loop {
+            let mut any_elements = false;
+            let mut inserted = false;
+            let start = coded_offsets.len(); // index in coded_offsets where this run begins
+            for offset in iter.by_ref() {
+                any_elements = true;
+                prefix_sum += offset;
+                if let Ok(offset) = offset.try_into() {
+                    coded_offsets.push(offset);
+                } else {
+                    short_offset_runs.push(ShortOffsetRunHeader {
+                        start_idx: start.try_into().unwrap(),
+                        prefix_sum,
+                    });
+                    // Placeholder for the offset that did not fit in a u8, so that
+                    // the even/odd parity of the following indices stays correct.
+                    coded_offsets.push(0);
+                    inserted = true;
+                    break;
+                }
+            }
+            if !any_elements {
+                break;
+            }
+            // The huge char::MAX + 1 offset appended above can never fit into a
+            // u8, so every non-empty pass must have ended by inserting a header.
+            assert!(inserted);
+        }
+
+        writeln!(
+            &mut self.file,
+            "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];",
+            short_offset_runs.len(),
+            fmt_list(short_offset_runs.iter().map(|v| v.pack()))
+        )
+        .unwrap();
+        self.bytes_used += 4 * short_offset_runs.len(); // each packed header is one u32
+        writeln!(
+            &mut self.file,
+            "static OFFSETS: [u8; {}] = [{}];",
+            coded_offsets.len(),
+            fmt_list(&coded_offsets)
+        )
+        .unwrap();
+        self.bytes_used += coded_offsets.len(); // one byte per coded offset
+
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    super::skip_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &SHORT_OFFSET_RUNS,").unwrap();
+        writeln!(&mut self.file, "        &OFFSETS,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+    }
+}