From 064f8885d5e1d38673783d626d9d3fc1b7b909f4 Mon Sep 17 00:00:00 2001
From: Mark Rousskov
Date: Mon, 13 Jan 2020 16:40:19 -0500
Subject: [PATCH] Add unicode table generator

---
 .gitignore                                    |   9 +-
 Cargo.lock                                    |  17 ++
 Cargo.toml                                    |   1 +
 src/tools/unicode-table-generator/Cargo.toml  |  10 +
 .../src/case_mapping.rs                       |  67 +++++
 src/tools/unicode-table-generator/src/main.rs | 264 ++++++++++++++++++
 .../src/raw_emitter.rs                        | 176 ++++++++++++
 .../src/unicode_download.rs                   |  44 +++
 8 files changed, 580 insertions(+), 8 deletions(-)
 create mode 100644 src/tools/unicode-table-generator/Cargo.toml
 create mode 100644 src/tools/unicode-table-generator/src/case_mapping.rs
 create mode 100644 src/tools/unicode-table-generator/src/main.rs
 create mode 100644 src/tools/unicode-table-generator/src/raw_emitter.rs
 create mode 100644 src/tools/unicode-table-generator/src/unicode_download.rs

diff --git a/.gitignore b/.gitignore
index 1428ee6c9bc..d9761ce4092 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,14 +34,7 @@ __pycache__/
 # Created by default with `src/ci/docker/run.sh`:
 /obj/
 /rustllvm/
-/src/libcore/unicode/DerivedCoreProperties.txt
-/src/libcore/unicode/DerivedNormalizationProps.txt
-/src/libcore/unicode/PropList.txt
-/src/libcore/unicode/ReadMe.txt
-/src/libcore/unicode/Scripts.txt
-/src/libcore/unicode/SpecialCasing.txt
-/src/libcore/unicode/UnicodeData.txt
-/src/libcore/unicode/downloaded
+/unicode-downloads
 /target/
 # Generated by compiletest for incremental:
 /tmp/
diff --git a/Cargo.lock b/Cargo.lock
index 4836e15cd79..3f1058645d2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4930,6 +4930,16 @@ version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"
 
+[[package]]
+name = "ucd-parse"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
+dependencies = [
+ "lazy_static 1.3.0",
+ "regex",
+]
+
 [[package]]
 name = "ucd-trie"
 version = "0.1.1"
@@ -4951,6 +4961,13 @@ dependencies = [
  "version_check 0.1.5",
 ]
 
+[[package]]
+name = "unicode-bdd"
+version = "0.1.0"
+dependencies = [
+ "ucd-parse",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.4"
diff --git a/Cargo.toml b/Cargo.toml
index a242f090fbc..9d5c27b96df 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
   "src/tools/rustfmt",
   "src/tools/miri",
   "src/tools/rustdoc-themes",
+  "src/tools/unicode-table-generator",
 ]
 exclude = [
   "build",
diff --git a/src/tools/unicode-table-generator/Cargo.toml b/src/tools/unicode-table-generator/Cargo.toml
new file mode 100644
index 00000000000..92344cdfc89
--- /dev/null
+++ b/src/tools/unicode-table-generator/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "unicode-bdd"
+version = "0.1.0"
+authors = ["Mark Rousskov"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ucd-parse = "0.1.3"
diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs
new file mode 100644
index 00000000000..01f199c213e
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/case_mapping.rs
@@ -0,0 +1,67 @@
+use crate::{fmt_list, UnicodeData};
+use std::fmt;
+
+/// Renders the `conversions` module body: the lookup functions from `HEADER`
+/// followed by the lowercase and uppercase mapping tables.
+pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
+    let mut file = String::new();
+
+    file.push_str(HEADER.trim_start());
+
+    let decl_type = "&[(char, [char; 3])]";
+
+    file.push_str(&format!(
+        "static LOWERCASE_TABLE: {} = &[{}];",
+        decl_type,
+        fmt_list(data.to_lower.iter().map(to_mapping))
+    ));
+    file.push_str("\n\n");
+    file.push_str(&format!(
+        "static UPPERCASE_TABLE: {} = &[{}];",
+        decl_type,
+        fmt_list(data.to_upper.iter().map(to_mapping))
+    ));
+    file
+}
+
+/// Turns one table entry into escaped `char`s; panics if any value is not a
+/// valid `char`.
+fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
+    (
+        CharEscape(std::char::from_u32(*key).unwrap()),
+        [
+            CharEscape(std::char::from_u32(*a).unwrap()),
+            CharEscape(std::char::from_u32(*b).unwrap()),
+            CharEscape(std::char::from_u32(*c).unwrap()),
+        ],
+    )
+}
+
+/// Wrapper whose `Debug` output is a quoted, escaped Rust char literal.
+struct CharEscape(char);
+
+impl fmt::Debug for CharEscape {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "'{}'", self.0.escape_default())
+    }
+}
+
+static HEADER: &str = "
+pub fn to_lower(c: char) -> [char; 3] {
+    match bsearch_case_table(c, LOWERCASE_TABLE) {
+        None => [c, '\\0', '\\0'],
+        Some(index) => LOWERCASE_TABLE[index].1,
+    }
+}
+
+pub fn to_upper(c: char) -> [char; 3] {
+    match bsearch_case_table(c, UPPERCASE_TABLE) {
+        None => [c, '\\0', '\\0'],
+        Some(index) => UPPERCASE_TABLE[index].1,
+    }
+}
+
+fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
+    table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
+}
+";
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
new file mode 100644
index 00000000000..be8508e3973
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -0,0 +1,264 @@
+use std::collections::{BTreeMap, HashMap};
+use std::ops::Range;
+use ucd_parse::Codepoints;
+
+mod case_mapping;
+mod raw_emitter;
+mod unicode_download;
+
+use raw_emitter::{emit_codepoints, RawEmitter};
+
+static PROPERTIES: &[&str] = &[
+    "Alphabetic",
+    "Lowercase",
+    "Uppercase",
+    "Cased",
+    "Case_Ignorable",
+    "Grapheme_Extend",
+    "White_Space",
+    "Cc",
+    "N",
+];
+
+/// Parsed UCD data: merged codepoint ranges per property (sorted by property
+/// name) plus the unconditional upper/lower case mappings keyed by codepoint.
+struct UnicodeData {
+    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
+    to_upper: BTreeMap<u32, (u32, u32, u32)>,
+    to_lower: BTreeMap<u32, (u32, u32, u32)>,
+}
+
+/// Converts a SpecialCasing mapping into a zero-padded triple.
+///
+/// Returns `None` as soon as any mapped codepoint equals `origin`; panics if
+/// there are more than three mapped codepoints or none at all.
+fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
+    let mut a = None;
+    let mut b = None;
+    let mut c = None;
+
+    for codepoint in codepoints {
+        if origin == codepoint.value() {
+            return None;
+        }
+
+        if a.is_none() {
+            a = Some(codepoint.value());
+        } else if b.is_none() {
+            b = Some(codepoint.value());
+        } else if c.is_none() {
+            c = Some(codepoint.value());
+        } else {
+            panic!("more than 3 mapped codepoints")
+        }
+    }
+
+    Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
+}
+
+static UNICODE_DIRECTORY: &str = "unicode-downloads";
+
+/// Downloads the UCD files and collects the tracked properties and case
+/// mappings from them.
+fn load_data() -> UnicodeData {
+    unicode_download::fetch_latest();
+
+    let mut properties = HashMap::new();
+    for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+            properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+        }
+    }
+    for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+            properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+        }
+    }
+
+    let mut to_lower = BTreeMap::new();
+    let mut to_upper = BTreeMap::new();
+    for row in ucd_parse::UnicodeDataExpander::new(
+        ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
+    ) {
+        let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
+            "N"
+        } else {
+            row.general_category.as_str()
+        };
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
+            properties
+                .entry(*name)
+                .or_insert_with(Vec::new)
+                .push(Codepoints::Single(row.codepoint));
+        }
+
+        if let Some(mapped) = row.simple_lowercase_mapping {
+            if mapped != row.codepoint {
+                to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+            }
+        }
+        if let Some(mapped) = row.simple_uppercase_mapping {
+            if mapped != row.codepoint {
+                to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+            }
+        }
+    }
+
+    for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
+        if !row.conditions.is_empty() {
+            // Skip conditional case mappings
+            continue;
+        }
+
+        let key = row.codepoint.value();
+        if let Some(lower) = to_mapping(key, row.lowercase) {
+            to_lower.insert(key, lower);
+        }
+        if let Some(upper) = to_mapping(key, row.uppercase) {
+            to_upper.insert(key, upper);
+        }
+    }
+
+    let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
+        .into_iter()
+        .map(|(k, v)| {
+            (
+                k,
+                v.into_iter()
+                    .flat_map(|codepoints| match codepoints {
+                        Codepoints::Single(c) => c
+                            .scalar()
+                            .map(|ch| (ch as u32..ch as u32 + 1))
+                            .into_iter()
+                            .collect::<Vec<_>>(),
+                        Codepoints::Range(c) => c
+                            .into_iter()
+                            .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
+                            .collect::<Vec<_>>(),
+                    })
+                    .collect::<Vec<Range<u32>>>(),
+            )
+        })
+        .collect();
+
+    for ranges in properties.values_mut() {
+        merge_ranges(ranges);
+    }
+
+    let mut properties = properties.into_iter().collect::<Vec<_>>();
+    properties.sort_by_key(|p| p.0);
+    UnicodeData { ranges: properties, to_lower, to_upper }
+}
+
+fn main() {
+    let write_location = std::env::args().nth(1).unwrap_or_else(|| {
+        eprintln!("Must provide path to write unicode tables to");
+        eprintln!(
+            "e.g. {} src/libcore/unicode/unicode_data.rs",
+            std::env::args().nth(0).unwrap_or_default()
+        );
+        std::process::exit(1);
+    });
+
+    let unicode_data = load_data();
+    let ranges_by_property = &unicode_data.ranges;
+
+    let mut total_bytes = 0;
+    let mut modules = Vec::new();
+    for (property, ranges) in ranges_by_property {
+        let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+        let mut emitter = RawEmitter::new();
+        emit_codepoints(&mut emitter, &ranges);
+
+        modules.push((property.to_lowercase().to_string(), emitter.file));
+        println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
+        total_bytes += emitter.bytes_used;
+    }
+
+    let mut table_file = String::new();
+
+    table_file.push_str(
+        "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
+    );
+
+    table_file.push_str("use super::range_search;\n\n");
+
+    table_file.push_str(&version());
+
+    table_file.push('\n');
+
+    modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
+
+    for (name, contents) in modules {
+        table_file.push_str("#[rustfmt::skip]\n");
+        table_file.push_str(&format!("pub mod {} {{\n", name));
+        for line in contents.lines() {
+            if !line.trim().is_empty() {
+                table_file.push_str("    ");
+                table_file.push_str(&line);
+            }
+            table_file.push('\n');
+        }
+        table_file.push_str("}\n\n");
+    }
+
+    std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
+
+    println!("Total table sizes: {} bytes", total_bytes);
+}
+
+/// Builds the `UNICODE_VERSION` constant from the version named in ReadMe.txt.
+fn version() -> String {
+    let mut out = String::new();
+    out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
+
+    let readme =
+        std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
+            .unwrap();
+
+    let prefix = "for Version ";
+    let start = readme.find(prefix).unwrap() + prefix.len();
+    let end = readme.find(" of the Unicode Standard.").unwrap();
+    let version =
+        readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
+    let [major, minor, micro] = [version[0], version[1], version[2]];
+
+    out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
+    out
+}
+
+/// Debug-formats `values` as a comma-separated list, wrapping so that lines
+/// stay below 98 characters and indenting every line by four spaces.
+fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
+    let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
+    let mut out = String::new();
+    let mut line = format!("\n    ");
+    for piece in pieces {
+        if line.len() + piece.len() < 98 {
+            line.push_str(&piece);
+        } else {
+            out.push_str(line.trim_end());
+            out.push('\n');
+            line = format!("    {}", piece);
+        }
+    }
+    out.push_str(line.trim_end());
+    out.push('\n');
+    out
+}
+
+/// Coalesces adjacent ranges in place: `a..b` directly followed by `b..c`
+/// becomes `a..c`. An empty list is left untouched.
+///
+/// One left-to-right pass suffices, since each range either extends the most
+/// recently kept range or starts a new one.
+fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
+    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
+    for range in ranges.drain(..) {
+        match merged.last_mut() {
+            Some(prev) if prev.end == range.start => prev.end = range.end,
+            _ => merged.push(range),
+        }
+    }
+    *ranges = merged;
+}
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
new file mode 100644
index 00000000000..3e60ce13f92
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -0,0 +1,176 @@
+//! This implements the core logic of the compression scheme used to compactly
+//! encode the Unicode character classes.
+//!
+//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
+//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
+//! over 17 kilobytes of data per character set -- way too much for our
+//! purposes.
+//!
+//! We have two primary goals with the encoding: we want to be compact, because
+//! these tables often end up in ~every Rust program (especially the
+//! grapheme_extend table, used for str debugging), including those for embedded
+//! targets (where space is important). We also want to be relatively fast,
+//! though this is more of a nice to have rather than a key design constraint.
+//! In practice, due to modern processor design these two are closely related.
+//!
+//! The encoding scheme here compresses the bitset by first deduplicating the
+//! "words" (64 bits on all platforms). In practice very few words are present
+//! in most data sets.
+//!
+//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
+//! words, we could go to u16 -> word or have some dual compression scheme
+//! mapping into two separate sets; currently this is not dealt with).
+//!
+//! With that scheme, we now have a single byte for every 64 codepoints. We
+//! further group these by 16 (arbitrarily chosen), and again deduplicate and
+//! store in an array (u8 -> [u8; 16]).
+//!
+//! The indices into this array represent ranges of 64*16 = 1024 codepoints.
+//!
+//! This already reduces the top-level array to at most 1,086 bytes, but in
+//! practice we usually can encode in far fewer (the first couple Unicode planes
+//! are dense).
+//!
+//! The last byte of this top-level array is pulled out to a separate static
+//! and trailing zeros are dropped; this is simply because grapheme_extend and
+//! case_ignorable have a single entry in the 896th entry, so this shrinks them
+//! down considerably.
+
+use crate::fmt_list;
+use std::collections::{BTreeSet, HashMap};
+use std::convert::TryFrom;
+use std::fmt::Write;
+use std::ops::Range;
+
+/// Accumulates generated source text and a running total of table bytes.
+pub struct RawEmitter {
+    pub file: String,
+    pub bytes_used: usize,
+}
+
+impl RawEmitter {
+    pub fn new() -> RawEmitter {
+        RawEmitter { file: String::new(), bytes_used: 0 }
+    }
+
+    fn blank_line(&mut self) {
+        if self.file.is_empty() || self.file.ends_with("\n\n") {
+            return;
+        }
+        writeln!(&mut self.file, "").unwrap();
+    }
+
+    /// Emits the deduplicated word table and the two-level chunk index for
+    /// `words`, adding the size of each emitted static to `bytes_used`.
+    fn emit_bitset(&mut self, words: &[u64]) {
+        let unique_words =
+            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
+        if unique_words.len() > u8::max_value() as usize {
+            panic!("cannot pack {} into 8 bits", unique_words.len());
+        }
+
+        let word_indices = unique_words
+            .iter()
+            .cloned()
+            .enumerate()
+            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
+            .collect::<HashMap<_, _>>();
+
+        let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
+        let chunk_length = 16;
+        for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
+            assert_eq!(unique_words[0], 0, "first word is all zeros");
+            // pad out bitset index with zero words so we have all chunks of 16
+            idx.push(0);
+        }
+
+        let mut chunks = BTreeSet::new();
+        for chunk in idx.chunks(chunk_length) {
+            chunks.insert(chunk);
+        }
+        let chunk_map = chunks
+            .clone()
+            .into_iter()
+            .enumerate()
+            .map(|(idx, chunk)| (chunk, idx))
+            .collect::<HashMap<_, _>>();
+        let mut chunk_indices = Vec::new();
+        for chunk in idx.chunks(chunk_length) {
+            chunk_indices.push(chunk_map[chunk]);
+        }
+        writeln!(
+            &mut self.file,
+            "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
+            chunk_indices.len() - 1,
+            chunk_indices.pop().unwrap(),
+        )
+        .unwrap();
+        self.bytes_used += 3;
+        // Strip out the empty pieces, presuming our above pop() made us now
+        // have some trailing zeros.
+        assert_eq!(unique_words[0], 0, "first word is all zeros");
+        while let Some(0) = chunk_indices.last() {
+            chunk_indices.pop();
+        }
+        writeln!(
+            &mut self.file,
+            "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
+            chunk_indices.len(),
+            fmt_list(&chunk_indices),
+        )
+        .unwrap();
+        self.bytes_used += chunk_indices.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
+            chunks.len(),
+            fmt_list(chunks.iter()),
+        )
+        .unwrap();
+        self.bytes_used += 16 * chunks.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET: [u64; {}] = [{}];",
+            unique_words.len(),
+            fmt_list(&unique_words),
+        )
+        .unwrap();
+        self.bytes_used += 8 * unique_words.len();
+    }
+
+    /// Emits the `lookup` function that queries the statics via `range_search`.
+    pub fn emit_lookup(&mut self) {
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    super::range_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
+        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
+        writeln!(&mut self.file, "        &BITSET,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+    }
+}
+
+/// Flattens `ranges` into a bitset of 64-bit words and emits its encoding
+/// together with the lookup function. Panics if `ranges` is empty.
+pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+    emitter.blank_line();
+
+    let last_code_point = ranges.last().unwrap().end;
+    // bitset for every bit in the codepoint range
+    //
+    // + 2 to ensure an all zero word to use for padding
+    let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
+    for range in ranges {
+        for codepoint in range.clone() {
+            let bucket = codepoint as usize / 64;
+            let bit = codepoint as u64 % 64;
+            buckets[bucket] |= 1 << bit;
+        }
+    }
+
+    emitter.emit_bitset(&buckets);
+    emitter.blank_line();
+    emitter.emit_lookup();
+}
diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs
new file mode 100644
index 00000000000..3f6de9ea3bb
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/unicode_download.rs
@@ -0,0 +1,44 @@
+use crate::UNICODE_DIRECTORY;
+use std::path::Path;
+use std::process::Command;
+
+static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/";
+
+static README: &str = "ReadMe.txt";
+
+static RESOURCES: &[&str] =
+    &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"];
+
+/// Downloads ReadMe.txt and the UCD data files into `UNICODE_DIRECTORY` using
+/// `curl`; panics if the directory, a `curl` run, or a file write fails.
+pub fn fetch_latest() {
+    let directory = Path::new(UNICODE_DIRECTORY);
+    if let Err(e) = std::fs::create_dir_all(directory) {
+        if e.kind() != std::io::ErrorKind::AlreadyExists {
+            panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
+        }
+    }
+    let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
+    if !output.status.success() {
+        panic!(
+            "Failed to run curl to fetch readme: stderr: {}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+    }
+    let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default();
+    if current.as_bytes() != &output.stdout[..] {
+        std::fs::write(directory.join(README), output.stdout).unwrap();
+    }
+
+    for resource in RESOURCES {
+        let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap();
+        if !output.status.success() {
+            panic!(
+                "Failed to run curl to fetch {}: stderr: {}",
+                resource,
+                String::from_utf8_lossy(&output.stderr)
+            );
+        }
+        std::fs::write(directory.join(resource), output.stdout).unwrap();
+    }
+}