Auto merge of #70499 - Dylan-DPC:rollup-f9je1l8, r=Dylan-DPC
Rollup of 5 pull requests

Successful merges:

- #70418 (Add long error explanation for E0703)
- #70448 (Create output dir in rustdoc markdown render)
- #70486 (Shrink Unicode tables (even more))
- #70493 (Fix rustdoc.css CSS tab-size property)
- #70495 (Replace last mention of IRC with Discord)

Failed merges:

r? @ghost
This commit is contained in: c52cee172f
@@ -32,28 +32,3 @@ pub use unicode_data::lowercase::lookup as Lowercase;
pub use unicode_data::n::lookup as N;
pub use unicode_data::uppercase::lookup as Uppercase;
pub use unicode_data::white_space::lookup as White_Space;

#[inline(always)]
fn range_search<const N: usize, const N1: usize, const N2: usize>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    (last_chunk_idx, last_chunk_mapping): (u16, u8),
    bitset_chunk_idx: &[[u8; 16]; N1],
    bitset: &[u64; N2],
) -> bool {
    let bucket_idx = (needle / 64) as usize;
    let chunk_map_idx = bucket_idx / 16;
    let chunk_piece = bucket_idx % 16;
    let chunk_idx = if chunk_map_idx >= N {
        if chunk_map_idx == last_chunk_idx as usize {
            last_chunk_mapping
        } else {
            return false;
        }
    } else {
        chunk_idx_map[chunk_map_idx]
    };
    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
    let word = bitset[(idx as usize)];
    (word & (1 << (needle % 64) as u64)) != 0
}
File diff suppressed because it is too large
@@ -389,6 +389,7 @@ E0698: include_str!("./error_codes/E0698.md"),
E0699: include_str!("./error_codes/E0699.md"),
E0700: include_str!("./error_codes/E0700.md"),
E0701: include_str!("./error_codes/E0701.md"),
E0703: include_str!("./error_codes/E0703.md"),
E0704: include_str!("./error_codes/E0704.md"),
E0705: include_str!("./error_codes/E0705.md"),
E0706: include_str!("./error_codes/E0706.md"),
@@ -603,7 +604,6 @@ E0751: include_str!("./error_codes/E0751.md"),
// E0694, // an unknown tool name found in scoped attributes
E0696, // `continue` pointing to a labeled block
// E0702, // replaced with a generic attribute input check
E0703, // invalid ABI
// E0707, // multiple elided lifetimes used in arguments of `async fn`
E0708, // `async` non-`move` closures with parameters are not currently
       // supported
src/librustc_error_codes/error_codes/E0703.md (new file, 17 lines)
@@ -0,0 +1,17 @@
Invalid ABI (Application Binary Interface) used in the code.

Erroneous code example:

```compile_fail,E0703
extern "invalid" fn foo() {} // error!
# fn main() {}
```

At present, only a few predefined ABIs (like Rust, C, system, etc.) can be
used in Rust. Verify that the ABI used is one of them. For example, you
can replace the invalid ABI with 'Rust':

```
extern "Rust" fn foo() {} // ok!
# fn main() { }
```
@@ -1082,8 +1082,8 @@ h3 > .collapse-toggle, h4 > .collapse-toggle {

pre.rust {
    position: relative;
    tab-width: 4;
    -moz-tab-width: 4;
    tab-size: 4;
    -moz-tab-size: 4;
}

.search-failed {
@@ -1,4 +1,4 @@
use std::fs::File;
use std::fs::{create_dir_all, File};
use std::io::prelude::*;
use std::path::PathBuf;

@@ -40,6 +40,11 @@ pub fn render(
    diag: &rustc_errors::Handler,
    edition: Edition,
) -> i32 {
    if let Err(e) = create_dir_all(&options.output) {
        diag.struct_err(&format!("{}: {}", options.output.display(), e)).emit();
        return 4;
    }

    let mut output = options.output;
    output.push(input.file_name().unwrap());
    output.set_extension("html");
@@ -91,7 +91,8 @@
//! pull-requests for your suggested changes.
//!
//! Contributions are appreciated! If you see a part of the docs that can be
//! improved, submit a PR, or chat with us first on irc.mozilla.org #rust-docs.
//! improved, submit a PR, or chat with us first on [Discord][rust-discord]
//! #docs.
//!
//! # A Tour of The Rust Standard Library
//!
@@ -194,6 +195,7 @@
//! [multithreading]: thread/index.html
//! [other]: #what-is-in-the-standard-library-documentation
//! [primitive types]: ../book/ch03-02-data-types.html
//! [rust-discord]: https://discord.gg/rust-lang

#![stable(feature = "rust1", since = "1.0.0")]
#![doc(
@@ -8,3 +8,4 @@ LL | extern "路濫狼á́́" fn foo() {}

error: aborting due to previous error

For more information about this error, try `rustc --explain E0703`.
@ -8,3 +8,4 @@ LL | "invalid-ab_isize"
|
||||
|
||||
error: aborting due to previous error
|
||||
|
||||
For more information about this error, try `rustc --explain E0703`.
|
||||
|
@@ -1,9 +1,83 @@
//! This implements the core logic of the compression scheme used to compactly
//! encode Unicode properties.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice to have rather than a key design constraint.
//! It is expected that libraries/applications which are performance-sensitive
//! to Unicode property lookups are extremely rare, and those that care may find
//! the tradeoff of the raw bitsets worth it. For most applications, a
//! relatively fast but much smaller (and as such less cache-impacting, etc.)
//! data set is likely preferable.
//!
//! We have two separate encoding schemes: a skiplist-like approach, and a
//! compressed bitset. The datasets we consider mostly use the skiplist (it's
//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
//! Since the bitset is also faster, this seems an obvious choice. (As a
//! historical note, the bitset was also the prior implementation, so its
//! relative complexity had already been paid).
//!
//! ## The bitset
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! over 17 kilobytes of data per character set -- way too much for our
//! purposes.
//!
//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
//! not skipping the small 'gap') is associated into words (u64) and
//! deduplicated. On random data, this would be useless; on our data, this is
//! incredibly beneficial -- our data sets have (far) less than 256 unique
//! words.
//!
//! This gives us an array that maps `u8 -> word`; the current algorithm does
//! not handle the case of more than 256 unique words, but we are relatively far
//! from coming that close.
//!
//! With that scheme, we now have a single byte for every 64 codepoints.
//!
//! We further chunk these by some constant N (between 1 and 64 per group,
//! dynamically chosen for smallest size), and again deduplicate and store in an
//! array (u8 -> [u8; N]).
//!
//! The bytes of this array map into the words from the bitset above, but we
//! apply another trick here: some of these words are similar enough that they
//! can be represented by some function of another word. The particular
//! functions chosen are rotation, inversion, and shifting (right).
//!
//! ## The skiplist
//!
//! The skip list arose out of the desire for an even smaller encoding than the
//! bitset -- and was the answer to the question "what is the smallest
//! representation we can imagine?". However, it is not necessarily the
//! smallest, and if you have a better proposal, please do suggest it!
//!
//! This is a relatively straightforward encoding. First, we break up all the
//! ranges in the input data into offsets from each other, essentially a gap
//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
//! store those directly. We make use of the larger gaps (which are nicely
//! interspersed already) throughout the dataset to index this data set.
//!
//! In particular, each run of small gaps (terminating in a large gap) is
//! indexed in a separate dataset. That data set stores an index into the
//! primary offset list and a prefix sum of that offset list. These are packed
//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
//!
//! Lookup proceeds via a binary search in the index and then a straightforward
//! linear scan (adding up the offsets) until we reach the needle, and then the
//! index of that offset is utilized as the answer to whether we're in the set
//! or not.

use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;

mod case_mapping;
mod raw_emitter;
mod skiplist;
mod unicode_download;

use raw_emitter::{emit_codepoints, RawEmitter};
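The skiplist description above is terse; the following is a minimal standalone sketch of the gap encoding and parity-based lookup it relies on. It is simplified (no u8 truncation of gaps and no packed run headers), and the function names are illustrative rather than the generator's.

```rust
// Encode sorted, non-overlapping ranges as deltas between successive
// boundary points; membership is the parity of how many boundaries lie
// at or below the needle.
fn encode(ranges: &[std::ops::Range<u32>]) -> Vec<u32> {
    let mut offsets = Vec::new();
    let mut prev = 0;
    for r in ranges {
        for &point in &[r.start, r.end] {
            offsets.push(point - prev); // gap from the previous boundary
            prev = point;
        }
    }
    offsets
}

fn contains(offsets: &[u32], needle: u32) -> bool {
    let mut sum = 0;
    let mut idx = 0;
    for &off in offsets {
        sum += off;
        if sum > needle {
            break;
        }
        idx += 1;
    }
    // Odd index: we passed a `start` boundary but not its matching `end`.
    idx % 2 == 1
}

fn main() {
    let ranges = [3u32..6, 10..12];
    let offsets = encode(&ranges); // [3, 3, 4, 2]
    assert!(contains(&offsets, 4));
    assert!(!contains(&offsets, 7));
    assert!(contains(&offsets, 11));
    assert!(!contains(&offsets, 12));
}
```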
@@ -152,9 +226,17 @@ fn main() {
        std::process::exit(1);
    });

    // Optional test path, which is a Rust source file testing that the unicode
    // property lookups are correct.
    let test_path = std::env::args().nth(2);

    let unicode_data = load_data();
    let ranges_by_property = &unicode_data.ranges;

    if let Some(path) = test_path {
        std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
    }

    let mut total_bytes = 0;
    let mut modules = Vec::new();
    for (property, ranges) in ranges_by_property {
@@ -163,7 +245,16 @@ fn main() {
        emit_codepoints(&mut emitter, &ranges);

        modules.push((property.to_lowercase().to_string(), emitter.file));
        println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
        println!(
            "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
            property,
            emitter.bytes_used,
            datapoints,
            ranges.len(),
            ranges.first().unwrap().start,
            ranges.last().unwrap().end,
            emitter.desc,
        );
        total_bytes += emitter.bytes_used;
    }
@@ -173,7 +264,10 @@ fn main() {
        "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
    );

    table_file.push_str("use super::range_search;\n\n");
    // Include the range search function
    table_file.push('\n');
    table_file.push_str(include_str!("range_search.rs"));
    table_file.push('\n');

    table_file.push_str(&version());
@@ -236,21 +330,97 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    out
}

fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
    let mut s = String::new();
    s.push_str("#![allow(incomplete_features, unused)]\n");
    s.push_str("#![feature(const_generics)]\n\n");
    s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
    s.push_str(&format!("#[path = \"{}\"]\n", data_path));
    s.push_str("mod unicode_data;\n\n");

    s.push_str("\nfn main() {\n");

    for (property, ranges) in ranges {
        s.push_str(&format!(r#" println!("Testing {}");"#, property));
        s.push('\n');
        s.push_str(&format!(" {}_true();\n", property.to_lowercase()));
        s.push_str(&format!(" {}_false();\n", property.to_lowercase()));
        let mut is_true = Vec::new();
        let mut is_false = Vec::new();
        for ch_num in 0..(std::char::MAX as u32) {
            if std::char::from_u32(ch_num).is_none() {
                continue;
            }
            if ranges.iter().any(|r| r.contains(&ch_num)) {
                is_true.push(ch_num);
            } else {
                is_false.push(ch_num);
            }
        }

        s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase()));
        generate_asserts(&mut s, property, &is_true, true);
        s.push_str(" }\n\n");
        s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase()));
        generate_asserts(&mut s, property, &is_false, false);
        s.push_str(" }\n\n");
    }

    s.push_str("}");
    s
}

fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
    for range in ranges_from_set(points) {
        if range.end == range.start + 1 {
            s.push_str(&format!(
                " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
                if truthy { "" } else { "!" },
                property.to_lowercase(),
                std::char::from_u32(range.start).unwrap(),
                range.start,
            ));
        } else {
            s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
            s.push_str(&format!(
                " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
                if truthy { "" } else { "!" },
                property.to_lowercase(),
            ));
            s.push_str(" }\n");
        }
    }
}

fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
    let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
    merge_ranges(&mut ranges);
    ranges
}

fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    loop {
        let mut new_ranges = Vec::new();
        let mut idx_iter = 0..(ranges.len() - 1);
        let mut should_insert_last = true;
        while let Some(idx) = idx_iter.next() {
            let cur = ranges[idx].clone();
            let next = ranges[idx + 1].clone();
            if cur.end == next.start {
                let _ = idx_iter.next(); // skip next as we're merging it in
                if idx_iter.next().is_none() {
                    // We're merging the last element
                    should_insert_last = false;
                }
                new_ranges.push(cur.start..next.end);
            } else {
                // We're *not* merging the last element
                should_insert_last = true;
                new_ranges.push(cur);
            }
        }
        new_ranges.push(ranges.last().unwrap().clone());
        if should_insert_last {
            new_ranges.push(ranges.last().unwrap().clone());
        }
        if new_ranges.len() == ranges.len() {
            *ranges = new_ranges;
            break;
@@ -258,4 +428,12 @@ fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
            *ranges = new_ranges;
        }
    }

    let mut last_end = None;
    for range in ranges {
        if let Some(last) = last_end {
            assert!(range.start > last, "{:?}", range);
        }
        last_end = Some(range.end);
    }
}
src/tools/unicode-table-generator/src/range_search.rs (new file, 93 lines)
@@ -0,0 +1,93 @@
#[inline(always)]
fn bitset_search<
    const N: usize,
    const CHUNK_SIZE: usize,
    const N1: usize,
    const CANONICAL: usize,
    const CANONICALIZED: usize,
>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
    bitset_canonical: &[u64; CANONICAL],
    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
) -> bool {
    let bucket_idx = (needle / 64) as usize;
    let chunk_map_idx = bucket_idx / CHUNK_SIZE;
    let chunk_piece = bucket_idx % CHUNK_SIZE;
    let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
        v
    } else {
        return false;
    };
    let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
    let word = if let Some(word) = bitset_canonical.get(idx) {
        *word
    } else {
        let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
        let mut word = bitset_canonical[real_idx as usize];
        let should_invert = mapping & (1 << 6) != 0;
        if should_invert {
            word = !word;
        }
        // Lower 6 bits
        let quantity = mapping & ((1 << 6) - 1);
        if mapping & (1 << 7) != 0 {
            // shift
            word >>= quantity as u64;
        } else {
            word = word.rotate_left(quantity as u32);
        }
        word
    };
    (word & (1 << (needle % 64) as u64)) != 0
}

fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
    short_offset_run_header & ((1 << 21) - 1)
}

fn decode_length(short_offset_run_header: u32) -> usize {
    (short_offset_run_header >> 21) as usize
}

#[inline(always)]
fn skip_search<const SOR: usize, const OFFSETS: usize>(
    needle: u32,
    short_offset_runs: &[u32; SOR],
    offsets: &[u8; OFFSETS],
) -> bool {
    // Note that this *cannot* be past the end of the array, as the last
    // element is greater than std::char::MAX (the largest possible needle).
    //
    // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
    // location cannot be past it, so Err(idx) != length either.
    //
    // This means that we can avoid bounds checking for the accesses below, too.
    let last_idx =
        match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
            Ok(idx) => idx + 1,
            Err(idx) => idx,
        };

    let mut offset_idx = decode_length(short_offset_runs[last_idx]);
    let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
        decode_length(*next) - offset_idx
    } else {
        offsets.len() - offset_idx
    };
    let prev =
        last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);

    let total = needle - prev;
    let mut prefix_sum = 0;
    for _ in 0..(length - 1) {
        let offset = offsets[offset_idx];
        prefix_sum += offset as u32;
        if prefix_sum > total {
            break;
        }
        offset_idx += 1;
    }
    offset_idx % 2 == 1
}
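The second element of each `bitset_canonicalized` pair above is a small opcode byte: bit 7 selects a right shift, bit 6 an inversion, and the low six bits carry the shift/rotation amount. A minimal standalone sketch of that decode step, mirroring `bitset_search`; the helper name is illustrative only.

```rust
// Reconstruct a non-canonical word from a canonical one plus a mapping byte,
// applying inversion first and then either a right shift or a left rotation.
fn apply_mapping(canonical: u64, mapping: u8) -> u64 {
    let mut word = canonical;
    if mapping & (1 << 6) != 0 {
        word = !word; // invert first, as in bitset_search
    }
    let quantity = mapping & ((1 << 6) - 1);
    if mapping & (1 << 7) != 0 {
        word >> quantity as u64 // encoded as ShiftRight(quantity)
    } else {
        word.rotate_left(quantity as u32) // encoded as Rotate(quantity)
    }
}

fn main() {
    let canonical = 0x00FF_00FF_00FF_00FFu64;
    // Rotation: amount in the low 6 bits.
    assert_eq!(apply_mapping(canonical, 8), canonical.rotate_left(8));
    // Inversion: bit 6 set, amount 0.
    assert_eq!(apply_mapping(canonical, 1 << 6), !canonical);
    // Right shift by 4: bit 7 set.
    assert_eq!(apply_mapping(canonical, (1 << 7) | 4), canonical >> 4);
}
```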
@@ -1,55 +1,19 @@
//! This implements the core logic of the compression scheme used to compactly
//! encode the Unicode character classes.
//!
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! over 17 kilobytes of data per character set -- way too much for our
//! purposes.
//!
//! We have two primary goals with the encoding: we want to be compact, because
//! these tables often end up in ~every Rust program (especially the
//! grapheme_extend table, used for str debugging), including those for embedded
//! targets (where space is important). We also want to be relatively fast,
//! though this is more of a nice to have rather than a key design constraint.
//! In practice, due to modern processor design these two are closely related.
//!
//! The encoding scheme here compresses the bitset by first deduplicating the
//! "words" (64 bits on all platforms). In practice very few words are present
//! in most data sets.
//!
//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
//! words, we could go to u16 -> word or have some dual compression scheme
//! mapping into two separate sets; currently this is not dealt with).
//!
//! With that scheme, we now have a single byte for every 64 codepoints. We
//! further group these by 16 (arbitrarily chosen), and again deduplicate and
//! store in an array (u8 -> [u8; 16]).
//!
//! The indices into this array represent ranges of 64*16 = 1024 codepoints.
//!
//! This already reduces the top-level array to at most 1,086 bytes, but in
//! practice we usually can encode in far fewer (the first couple Unicode planes
//! are dense).
//!
//! The last byte of this top-level array is pulled out to a separate static
//! and trailing zeros are dropped; this is simply because grapheme_extend and
//! case_ignorable have a single entry in the 896th entry, so this shrinks them
//! down considerably.

use crate::fmt_list;
use std::collections::{BTreeSet, HashMap};
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::convert::TryFrom;
use std::fmt::Write;
use std::fmt::{self, Write};
use std::ops::Range;

#[derive(Clone)]
pub struct RawEmitter {
    pub file: String,
    pub desc: String,
    pub bytes_used: usize,
}

impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0 }
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
@@ -59,30 +23,100 @@ impl RawEmitter {
        writeln!(&mut self.file, "").unwrap();
    }

    fn emit_bitset(&mut self, words: &[u64]) {
    fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
        let last_code_point = ranges.last().unwrap().end;
        // bitset for every bit in the codepoint range
        //
        // + 2 to ensure an all zero word to use for padding
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, used for padding and
        // such.
        words.push(0);
        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::max_value() as usize {
            panic!("cannot pack {} into 8 bits", unique_words.len());
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");
        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = unique_words
            .iter()
            .cloned()
            .enumerate()
            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
            .collect::<HashMap<_, _>>();
        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();

        let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
        let chunk_length = 16;
        for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
        assert_eq!(unique_words[0], 0, "first word is all zeros");
            // pad out bitset index with zero words so we have all chunks of 16
            idx.push(0);
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }

        writeln!(
            &mut self.file,
            "static BITSET_CANONICAL: [u64; {}] = [{}];",
            canonicalized.canonical_words.len(),
            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
        )
        .unwrap();
        self.bytes_used += 8 * canonicalized.canonical_words.len();
        writeln!(
            &mut self.file,
            "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
            canonicalized.canonicalized_words.len(),
            fmt_list(&canonicalized.canonicalized_words),
        )
        .unwrap();
        // 8 bit index into shifted words, 7 bits for shift + optional flip
        // We only need it for the words that we removed by applying a shift and
        // flip to them.
        self.bytes_used += 2 * canonicalized.canonicalized_words.len();

        self.blank_line();

        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, " super::bitset_search(",).unwrap();
        writeln!(&mut self.file, " c as u32,").unwrap();
        writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
        writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, " )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();
    }

    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
            // pad out bitset index with zero words so we have all chunks of
            // chunk_length
            compressed_words.push(zero_at);
        }

        let mut chunks = BTreeSet::new();
        for chunk in idx.chunks(chunk_length) {
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }
        let chunk_map = chunks
@@ -92,23 +126,10 @@ impl RawEmitter {
            .map(|(idx, chunk)| (chunk, idx))
            .collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in idx.chunks(chunk_length) {
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }
        writeln!(
            &mut self.file,
            "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
            chunk_indices.len() - 1,
            chunk_indices.pop().unwrap(),
        )
        .unwrap();
        self.bytes_used += 3;
        // Strip out the empty pieces, presuming our above pop() made us now
        // have some trailing zeros.
        assert_eq!(unique_words[0], 0, "first word is all zeros");
        while let Some(0) = chunk_indices.last() {
            chunk_indices.pop();
        }

        writeln!(
            &mut self.file,
            "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
@@ -119,52 +140,253 @@ impl RawEmitter {
        self.bytes_used += chunk_indices.len();
        writeln!(
            &mut self.file,
            "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
            "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
            chunk_length,
            chunks.len(),
            fmt_list(chunks.iter()),
        )
        .unwrap();
        self.bytes_used += 16 * chunks.len();
        writeln!(
            &mut self.file,
            "static BITSET: [u64; {}] = [{}];",
            unique_words.len(),
            fmt_list(&unique_words),
        )
        .unwrap();
        self.bytes_used += 8 * unique_words.len();
    }

    pub fn emit_lookup(&mut self) {
        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, " super::range_search(",).unwrap();
        writeln!(&mut self.file, " c as u32,").unwrap();
        writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
        writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, " &BITSET,").unwrap();
        writeln!(&mut self.file, " )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();
        self.bytes_used += chunk_length * chunks.len();
    }
}

pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let last_code_point = ranges.last().unwrap().end;
    // bitset for every bit in the codepoint range
    //
    // + 2 to ensure an all zero word to use for padding
    let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
    for range in ranges {
        for codepoint in range.clone() {
            let bucket = codepoint as usize / 64;
            let bit = codepoint as u64 % 64;
            buckets[bucket] |= 1 << bit;
        }
    }
    let mut bitset = emitter.clone();
    bitset.emit_bitset(&ranges);

    emitter.emit_bitset(&buckets);
    emitter.blank_line();
    emitter.emit_lookup();
    let mut skiplist = emitter.clone();
    skiplist.emit_skiplist(&ranges);

    if bitset.bytes_used <= skiplist.bytes_used {
        *emitter = bitset;
        emitter.desc = format!("bitset");
    } else {
        *emitter = skiplist;
        emitter.desc = format!("skiplist");
    }
}

struct Canonicalized {
    canonical_words: Vec<u64>,
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to the associated index (u8) which is into
    /// canonical_words or canonicalized_words (in order).
    unique_mapping: HashMap<u64, u8>,
}

impl Canonicalized {
    fn canonicalize(unique_words: &[u64]) -> Self {
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // key is the word being mapped to
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
        for &a in unique_words {
            'b: for &b in unique_words {
                // skip self
                if a == b {
                    continue;
                }

                // All possible distinct rotations
                for rotation in 1..64 {
                    if a.rotate_right(rotation) == b {
                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                if (!a) == b {
                    mappings.entry(b).or_default().push((a, Mapping::Invert));
                    // We're not interested in further mappings between a and b
                    continue 'b;
                }

                // All possible distinct rotations, inverted
                for rotation in 1..64 {
                    if (!a.rotate_right(rotation)) == b {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::RotateAndInvert(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                // All possible shifts
                for shift_by in 1..64 {
                    if a == (b >> shift_by) {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::ShiftRight(shift_by as u32)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }
            }
        }
        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are mapped words, which will be represented by an index into
        // the canonical_words and a Mapping; u16 when encoded.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but would be wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero.
        //
        // FIXME: Experiment with choosing most common words in overall data set
        // for canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Get the mapping with the most entries. Currently, no mapping can
            // only exist transitively (i.e., there is no A, B, C such that A
            // does not map to C and but A maps to B maps to C), so this is
            // guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();
            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one.
                // Noting that it should be associated with the Nth canonical word.
                //
                // We do not assert that this is present, because there may be
                // no mappings to the `from` word; that's fine.
                mappings.remove(from);
                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for (_, mapped) in &mut mappings {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }
            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for (_, mapped) in &mut mappings {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink, just stick into the canonical
        // words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.)
        // We'll probably always have some slack though so this loop will still
        // be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }
        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());

        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (
                    key,
                    match value {
                        UniqueMapping::Canonicalized(idx) => {
                            u8::try_from(canonical_words.len() + idx).unwrap()
                        }
                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                    },
                )
            })
            .collect::<HashMap<_, _>>();

        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }

        const LOWER_6: u32 = (1 << 6) - 1;

        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (
                    u8::try_from(v.0).unwrap(),
                    match v.1 {
                        Mapping::RotateAndInvert(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            1 << 6 | (amount as u8)
                        }
                        Mapping::Rotate(amount) => {
                            assert_eq!(amount, amount & LOWER_6);
                            amount as u8
                        }
                        Mapping::Invert => 1 << 6,
                        Mapping::ShiftRight(shift_by) => {
                            assert_eq!(shift_by, shift_by & LOWER_6);
                            1 << 7 | (shift_by as u8)
                        }
                    },
                )
            })
            .collect::<Vec<(u8, u8)>>();
        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}
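`emit_chunk_map` above builds a two-level index: the per-64-codepoint byte index is cut into fixed-length chunks, duplicate chunks are stored once, and a small outer map selects the chunk for each position. A rough standalone sketch of that idea, with hypothetical names and no attempt to match the emitted statics:

```rust
use std::collections::BTreeMap;

// Deduplicate fixed-length chunks of a byte index; returns the outer map
// (one entry per chunk position) and the table of distinct chunks.
fn chunk_compress(index: &[u8], chunk_len: usize) -> (Vec<u8>, Vec<Vec<u8>>) {
    let mut chunks: Vec<Vec<u8>> = Vec::new();
    let mut seen: BTreeMap<Vec<u8>, u8> = BTreeMap::new();
    let mut outer = Vec::new();
    for chunk in index.chunks(chunk_len) {
        let key = chunk.to_vec();
        let idx = *seen.entry(key.clone()).or_insert_with(|| {
            chunks.push(key.clone());
            (chunks.len() - 1) as u8
        });
        outer.push(idx);
    }
    (outer, chunks)
}

fn main() {
    // 12 buckets worth of word indices; the two halves repeat.
    let index = [1u8, 0, 2, 0, 1, 0, 1, 0, 2, 0, 1, 0];
    let (outer, chunks) = chunk_compress(&index, 6);
    assert_eq!(outer, vec![0u8, 0]); // both chunks deduplicated to one entry
    assert_eq!(chunks.len(), 1);
    // Lookup for bucket 8: outer[8 / 6] selects the chunk, 8 % 6 the byte.
    assert_eq!(chunks[outer[8 / 6] as usize][8 % 6], index[8]);
}
```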
src/tools/unicode-table-generator/src/skiplist.rs (new file, 98 lines)
@@ -0,0 +1,98 @@
use crate::fmt_list;
use crate::raw_emitter::RawEmitter;
use std::convert::TryInto;
use std::fmt::Write as _;
use std::ops::Range;

/// This will get packed into a single u32 before inserting into the data set.
#[derive(Debug, PartialEq)]
struct ShortOffsetRunHeader {
    /// Note, we only allow for 21 bits here.
    prefix_sum: u32,

    /// Note, we actually only allow for 11 bits here. This should be enough --
    /// our largest sets are around ~1400 offsets long.
    start_idx: u16,
}

impl ShortOffsetRunHeader {
    fn pack(&self) -> u32 {
        assert!(self.start_idx < (1 << 11));
        assert!(self.prefix_sum < (1 << 21));

        (self.start_idx as u32) << 21 | self.prefix_sum
    }
}

impl RawEmitter {
    pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
        let mut offsets = Vec::<u32>::new();
        let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
        let mut offset = 0;
        for pt in points {
            let delta = pt - offset;
            offsets.push(delta);
            offset = pt;
        }
        // Guaranteed to terminate, as it's impossible to subtract a value this
        // large from a valid char.
        offsets.push(std::char::MAX as u32 + 1);
        let mut coded_offsets: Vec<u8> = Vec::new();
        let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![];
        let mut iter = offsets.iter().cloned();
        let mut prefix_sum = 0;
        loop {
            let mut any_elements = false;
            let mut inserted = false;
            let start = coded_offsets.len();
            for offset in iter.by_ref() {
                any_elements = true;
                prefix_sum += offset;
                if let Ok(offset) = offset.try_into() {
                    coded_offsets.push(offset);
                } else {
                    short_offset_runs.push(ShortOffsetRunHeader {
                        start_idx: start.try_into().unwrap(),
                        prefix_sum,
                    });
                    // This is just needed to maintain indices even/odd
                    // correctly.
                    coded_offsets.push(0);
                    inserted = true;
                    break;
                }
            }
            if !any_elements {
                break;
            }
            // We always append the huge char::MAX offset to the end which
            // should never be able to fit into the u8 offsets.
            assert!(inserted);
        }

        writeln!(
            &mut self.file,
            "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];",
            short_offset_runs.len(),
            fmt_list(short_offset_runs.iter().map(|v| v.pack()))
        )
        .unwrap();
        self.bytes_used += 4 * short_offset_runs.len();
        writeln!(
            &mut self.file,
            "static OFFSETS: [u8; {}] = [{}];",
            coded_offsets.len(),
            fmt_list(&coded_offsets)
        )
        .unwrap();
        self.bytes_used += coded_offsets.len();

        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, " super::skip_search(",).unwrap();
        writeln!(&mut self.file, " c as u32,").unwrap();
        writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();
        writeln!(&mut self.file, " &OFFSETS,").unwrap();
        writeln!(&mut self.file, " )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();
    }
}
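`pack` above and `decode_length`/`decode_prefix_sum` in range_search.rs agree on the header layout: start index in the high 11 bits, prefix sum in the low 21. A small round-trip sketch, using free functions in place of the real methods:

```rust
// Pack a run header and decode it back, matching the bit layout used by the
// generator and the generated lookup code.
fn pack(start_idx: u16, prefix_sum: u32) -> u32 {
    assert!(start_idx < (1 << 11));
    assert!(prefix_sum < (1 << 21));
    (start_idx as u32) << 21 | prefix_sum
}

fn decode_length(header: u32) -> usize {
    (header >> 21) as usize
}

fn decode_prefix_sum(header: u32) -> u32 {
    header & ((1 << 21) - 1)
}

fn main() {
    let header = pack(1337, 0x0F_FFFF);
    assert_eq!(decode_length(header), 1337);
    assert_eq!(decode_prefix_sum(header), 0x0F_FFFF);
}
```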
@@ -11,10 +11,15 @@ static RESOURCES: &[&str] =

pub fn fetch_latest() {
    let directory = Path::new(UNICODE_DIRECTORY);
    if directory.exists() {
        eprintln!(
            "Not refetching unicode data, already exists, please delete {:?} to regenerate",
            directory
        );
        return;
    }
    if let Err(e) = std::fs::create_dir_all(directory) {
        if e.kind() != std::io::ErrorKind::AlreadyExists {
            panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
        }
        panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
    }
    let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
    if !output.status.success() {