Mirror of https://github.com/rust-lang/rust.git (synced 2024-11-30 02:33:55 +00:00)

Replace old tables with new unicode data

This commit is contained in:
parent 40ad877851
commit efcda04739
@@ -3,7 +3,7 @@
 use crate::slice;
 use crate::str::from_utf8_unchecked_mut;
 use crate::unicode::printable::is_printable;
-use crate::unicode::tables::{conversions, derived_property, general_category, property};
+use crate::unicode::{self, conversions};
 
 use super::*;
 
@@ -552,7 +552,7 @@ impl char {
     pub fn is_alphabetic(self) -> bool {
         match self {
             'a'..='z' | 'A'..='Z' => true,
-            c => c > '\x7f' && derived_property::Alphabetic(c),
+            c => c > '\x7f' && unicode::Alphabetic(c),
         }
     }
 
@@ -583,7 +583,7 @@ impl char {
     pub fn is_lowercase(self) -> bool {
         match self {
             'a'..='z' => true,
-            c => c > '\x7f' && derived_property::Lowercase(c),
+            c => c > '\x7f' && unicode::Lowercase(c),
         }
     }
 
@@ -614,7 +614,7 @@ impl char {
     pub fn is_uppercase(self) -> bool {
         match self {
             'A'..='Z' => true,
-            c => c > '\x7f' && derived_property::Uppercase(c),
+            c => c > '\x7f' && unicode::Uppercase(c),
         }
     }
 
@@ -642,7 +642,7 @@ impl char {
     pub fn is_whitespace(self) -> bool {
         match self {
            ' ' | '\x09'..='\x0d' => true,
-            c => c > '\x7f' && property::White_Space(c),
+            c => c > '\x7f' && unicode::White_Space(c),
         }
     }
 
@@ -693,7 +693,7 @@ impl char {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn is_control(self) -> bool {
-        general_category::Cc(self)
+        unicode::Cc(self)
     }
 
     /// Returns `true` if this `char` has the `Grapheme_Extend` property.
@@ -707,7 +707,7 @@ impl char {
     /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
     #[inline]
     pub(crate) fn is_grapheme_extended(self) -> bool {
-        derived_property::Grapheme_Extend(self)
+        unicode::Grapheme_Extend(self)
     }
 
     /// Returns `true` if this `char` has one of the general categories for numbers.
@@ -739,7 +739,7 @@ impl char {
     pub fn is_numeric(self) -> bool {
         match self {
             '0'..='9' => true,
-            c => c > '\x7f' && general_category::N(c),
+            c => c > '\x7f' && unicode::N(c),
         }
     }
 
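Each hunk above keeps the cheap ASCII range check and only redirects the non-ASCII fallback from the old tables.rs modules (derived_property, general_category, property) to the new flat unicode module. A minimal, self-contained sketch of that fast-path shape; the in_table function below is a made-up stand-in for the generated table lookups, not the real API:

    // Sketch of the ASCII-fast-path pattern shared by is_alphabetic,
    // is_lowercase, is_uppercase, is_whitespace and is_numeric above.
    // `in_table` is a hypothetical stand-in for the generated lookups.
    fn in_table(c: char) -> bool {
        // Assumption: the real code consults generated bitset tables here.
        c.is_alphabetic()
    }

    fn is_alphabetic_like(c: char) -> bool {
        match c {
            // ASCII letters are answered without touching the tables.
            'a'..='z' | 'A'..='Z' => true,
            // Only codepoints above 0x7f pay for the table lookup.
            c => c > '\x7f' && in_table(c),
        }
    }

    fn main() {
        assert!(is_alphabetic_like('q'));
        assert!(is_alphabetic_like('é'));
        assert!(!is_alphabetic_like('#'));
    }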
@@ -37,9 +37,9 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
 
 // unstable re-exports
 #[unstable(feature = "unicode_version", issue = "49726")]
-pub use crate::unicode::tables::UNICODE_VERSION;
-#[unstable(feature = "unicode_version", issue = "49726")]
 pub use crate::unicode::version::UnicodeVersion;
+#[unstable(feature = "unicode_version", issue = "49726")]
+pub use crate::unicode::UNICODE_VERSION;
 
 use crate::fmt::{self, Write};
 use crate::iter::FusedIterator;
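Neither re-export is stable; only the path behind them changes. A nightly-only usage sketch (the feature gate and issue number come from the attributes above; the major/minor/micro fields match the generated constant visible in unicode.py further down):

    // Nightly-only sketch: reading the re-exported Unicode version.
    #![feature(unicode_version)]

    fn main() {
        let v = std::char::UNICODE_VERSION;
        println!("Unicode {}.{}.{}", v.major, v.minor, v.micro);
    }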
@@ -1,66 +0,0 @@
/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
/// implemented with postfix compression (sharing of identical child nodes),
/// which gives both compact size and fast lookup.
///
/// The space of Unicode codepoints is divided into 3 subareas, each
/// represented by a trie with different depth. In the first (0..0x800), there
/// is no trie structure at all; each u64 entry corresponds to a bitvector
/// effectively holding 64 bool values.
///
/// In the second (0x800..0x10000), each child of the root node represents a
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
/// the trie stores an 8-bit index into a shared table of leaf values. This
/// exploits the fact that in reasonable sets, many such leaves can be shared.
///
/// In the third (0x10000..0x110000), each child of the root node represents a
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
/// of a child tree. Each of these 64 bytes represents an index into the table
/// of shared 64-bit leaf values. This exploits the sparse structure in the
/// non-BMP range of most Unicode sets.
pub struct BoolTrie {
    // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
    pub r1: [u64; 32], // leaves

    // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
    pub r2: [u8; 992], // first level
    pub r3: &'static [u64], // leaves

    // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
    pub r4: [u8; 256], // first level
    pub r5: &'static [u8], // second level
    pub r6: &'static [u64], // leaves
}
impl BoolTrie {
    pub fn lookup(&self, c: char) -> bool {
        let c = c as u32;
        if c < 0x800 {
            trie_range_leaf(c, self.r1[(c >> 6) as usize])
        } else if c < 0x10000 {
            let child = self.r2[(c >> 6) as usize - 0x20];
            trie_range_leaf(c, self.r3[child as usize])
        } else {
            let child = self.r4[(c >> 12) as usize - 0x10];
            let leaf = self.r5[((child as usize) << 6) + ((c >> 6) as usize & 0x3f)];
            trie_range_leaf(c, self.r6[leaf as usize])
        }
    }
}

pub struct SmallBoolTrie {
    pub(crate) r1: &'static [u8], // first level
    pub(crate) r2: &'static [u64], // leaves
}

impl SmallBoolTrie {
    pub fn lookup(&self, c: char) -> bool {
        let c = c as u32;
        match self.r1.get((c >> 6) as usize) {
            Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
            None => false,
        }
    }
}

fn trie_range_leaf(c: u32, bitmap_chunk: u64) -> bool {
    ((bitmap_chunk >> (c & 63)) & 1) != 0
}
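A worked run of the index arithmetic being deleted here, for a codepoint in the middle (0x800..0x10000) range; the table contents below are made up, only the shifts and offsets mirror BoolTrie::lookup:

    // Worked example of BoolTrie's middle-range lookup path. The leaf value
    // is fabricated; the arithmetic matches the deleted code above.
    fn trie_range_leaf(c: u32, bitmap_chunk: u64) -> bool {
        // Bit (c % 64) of the 64-bit leaf decides membership.
        ((bitmap_chunk >> (c & 63)) & 1) != 0
    }

    fn main() {
        let c = 0x0939_u32; // a codepoint in 0x800..0x10000

        // Which 64-codepoint block? 0x939 >> 6 = 0x24. r2 starts at
        // codepoint 0x800 (block 0x20), hence `- 0x20` in the real code.
        let r2_index = (c >> 6) as usize - 0x20;
        assert_eq!(r2_index, 0x4);

        // r2[r2_index] would yield an 8-bit index into the shared r3
        // leaves; pretend the selected leaf has exactly our bit set.
        let leaf: u64 = 1 << (c & 63);
        assert!(trie_range_leaf(c, leaf));
    }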
File diff suppressed because it is too large
@@ -1,878 +0,0 @@
#!/usr/bin/env python

"""
Regenerate Unicode tables (tables.rs).
"""

# This script uses the Unicode tables as defined
# in the UnicodeFiles class.

# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs file into git.

# Note that the "curl" program is required for operation.
# This script is compatible with Python 2.7 and 3.x.

import argparse
import datetime
import fileinput
import itertools
import os
import re
import textwrap
import subprocess

from collections import defaultdict, namedtuple

try:
    # Python 3
    from itertools import zip_longest
    from io import StringIO
except ImportError:
    # Python 2 compatibility
    zip_longest = itertools.izip_longest
    from StringIO import StringIO

try:
    # Completely optional type hinting
    # (Python 2 compatible using comments,
    # see: https://mypy.readthedocs.io/en/latest/python2.html)
    # This is very helpful in typing-aware IDE like PyCharm.
    from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple
except ImportError:
    pass


# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
    # ReadMe does not contain any Unicode data, we
    # only use it to extract versions.
    README = "ReadMe.txt"

    DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
    DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
    PROPS = "PropList.txt"
    SCRIPTS = "Scripts.txt"
    SPECIAL_CASING = "SpecialCasing.txt"
    UNICODE_DATA = "UnicodeData.txt"


# The order doesn't really matter (Python < 3.6 won't preserve it),
# we only want to aggregate all the file names.
ALL_UNICODE_FILES = tuple(
    value for name, value in UnicodeFiles.__dict__.items()
    if not name.startswith("_")
)

assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"

# The directory this file is located in.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

# Where to download the Unicode data. The downloaded files
# will be placed in sub-directories named after Unicode version.
FETCH_DIR = os.path.join(THIS_DIR, "downloaded")

FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}"
FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}"

PREAMBLE = """\
// NOTE: The following code was generated by "./unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case, clippy::unreadable_literal)]

use crate::unicode::bool_trie::{{BoolTrie, SmallBoolTrie}};
use crate::unicode::version::UnicodeVersion;
""".format(year=datetime.datetime.now().year)

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
EXPANDED_CATEGORIES = {
    "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"],
    "Lm": ["L"], "Lo": ["L"],
    "Mn": ["M"], "Mc": ["M"], "Me": ["M"],
    "Nd": ["N"], "Nl": ["N"], "No": ["N"],
    "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"],
    "Pi": ["P"], "Pf": ["P"], "Po": ["P"],
    "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"],
    "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"],
    "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
}

# This is the (inclusive) range of surrogate codepoints.
# These are not valid Rust characters.
SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)

UnicodeData = namedtuple(
    "UnicodeData", (
        # Conversions:
        "to_upper", "to_lower", "to_title",

        # Decompositions: canonical decompositions, compatibility decomp
        "canon_decomp", "compat_decomp",

        # Grouped: general categories and combining characters
        "general_categories", "combines",
    )
)

UnicodeVersion = namedtuple(
    "UnicodeVersion", ("major", "minor", "micro", "as_str")
)


def fetch_files(version=None):
    # type: (str) -> UnicodeVersion
    """
    Fetch all the Unicode files from unicode.org.

    This will use cached files (stored in `FETCH_DIR`) if they exist,
    creating them if they don't. In any case, the Unicode version
    is always returned.

    :param version: The desired Unicode version, as string.
        (If None, defaults to latest final release available,
        querying the unicode.org service).
    """
    have_version = check_stored_version(version)
    if have_version:
        return have_version

    if version:
        # Check if the desired version exists on the server.
        get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
    else:
        # Extract the latest version.
        get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)

    readme_url = get_fetch_url(UnicodeFiles.README)

    print("Fetching: {}".format(readme_url))
    readme_content = subprocess.check_output(("curl", readme_url))

    unicode_version = parse_readme_unicode_version(
        readme_content.decode("utf8")
    )

    download_dir = get_unicode_dir(unicode_version)
    if not os.path.exists(download_dir):
        # For 2.7 compat, we don't use `exist_ok=True`.
        os.makedirs(download_dir)

    for filename in ALL_UNICODE_FILES:
        file_path = get_unicode_file_path(unicode_version, filename)

        if os.path.exists(file_path):
            # Assume file on the server didn't change if it's been saved before.
            continue

        if filename == UnicodeFiles.README:
            with open(file_path, "wb") as fd:
                fd.write(readme_content)
        else:
            url = get_fetch_url(filename)
            print("Fetching: {}".format(url))
            subprocess.check_call(("curl", "-o", file_path, url))

    return unicode_version


def check_stored_version(version):
    # type: (Optional[str]) -> Optional[UnicodeVersion]
    """
    Given desired Unicode version, return the version
    if stored files are all present, and `None` otherwise.
    """
    if not version:
        # If no desired version specified, we should check what's the latest
        # version, skipping stored version checks.
        return None

    fetch_dir = os.path.join(FETCH_DIR, version)

    for filename in ALL_UNICODE_FILES:
        file_path = os.path.join(fetch_dir, filename)

        if not os.path.exists(file_path):
            return None

    with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
        return parse_readme_unicode_version(fd.read())


def parse_readme_unicode_version(readme_content):
    # type: (str) -> UnicodeVersion
    """
    Parse the Unicode version contained in their `ReadMe.txt` file.
    """
    # "Raw string" is necessary for \d not being treated as escape char
    # (for the sake of compat with future Python versions).
    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
    groups = re.search(pattern, readme_content).groups()

    return UnicodeVersion(*map(int, groups), as_str=".".join(groups))


def get_unicode_dir(unicode_version):
    # type: (UnicodeVersion) -> str
    """
    Indicate in which parent dir the Unicode data files should be stored.

    This returns a full, absolute path.
    """
    return os.path.join(FETCH_DIR, unicode_version.as_str)


def get_unicode_file_path(unicode_version, filename):
    # type: (UnicodeVersion, str) -> str
    """
    Indicate where the Unicode data file should be stored.
    """
    return os.path.join(get_unicode_dir(unicode_version), filename)


def is_surrogate(n):
    # type: (int) -> bool
    """
    Tell if given codepoint is a surrogate (not a valid Rust character).
    """
    return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1]


def load_unicode_data(file_path):
    # type: (str) -> UnicodeData
    """
    Load main Unicode data.
    """
    # Conversions
    to_lower = {}  # type: Dict[int, Tuple[int, int, int]]
    to_upper = {}  # type: Dict[int, Tuple[int, int, int]]
    to_title = {}  # type: Dict[int, Tuple[int, int, int]]

    # Decompositions
    compat_decomp = {}  # type: Dict[int, List[int]]
    canon_decomp = {}  # type: Dict[int, List[int]]

    # Combining characters
    # FIXME: combines are not used
    combines = defaultdict(set)  # type: Dict[str, Set[int]]

    # Categories
    general_categories = defaultdict(set)  # type: Dict[str, Set[int]]
    category_assigned_codepoints = set()  # type: Set[int]

    all_codepoints = {}

    range_start = -1

    for line in fileinput.input(file_path):
        data = line.split(";")
        if len(data) != 15:
            continue
        codepoint = int(data[0], 16)
        if is_surrogate(codepoint):
            continue
        if range_start >= 0:
            for i in range(range_start, codepoint):
                all_codepoints[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = codepoint
            continue
        all_codepoints[codepoint] = data

    for code, data in all_codepoints.items():
        (code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase) = data

        # Generate char to char direct common and simple conversions:

        # Uppercase to lowercase
        if lowcase != "" and code_org != lowcase:
            to_lower[code] = (int(lowcase, 16), 0, 0)

        # Lowercase to uppercase
        if upcase != "" and code_org != upcase:
            to_upper[code] = (int(upcase, 16), 0, 0)

        # Title case
        if titlecase.strip() != "" and code_org != titlecase:
            to_title[code] = (int(titlecase, 16), 0, 0)

        # Store decomposition, if given
        if decomp:
            decompositions = decomp.split()[1:]
            decomp_code_points = [int(i, 16) for i in decompositions]

            if decomp.startswith("<"):
                # Compatibility decomposition
                compat_decomp[code] = decomp_code_points
            else:
                # Canonical decomposition
                canon_decomp[code] = decomp_code_points

        # Place letter in categories as appropriate.
        for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
            general_categories[cat].add(code)
            category_assigned_codepoints.add(code)

        # Record combining class, if any.
        if combine != "0":
            combines[combine].add(code)

    # Generate Not_Assigned from Assigned.
    general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)

    # Other contains Not_Assigned
    general_categories["C"].update(general_categories["Cn"])

    grouped_categories = group_categories(general_categories)

    # FIXME: combines are not used
    return UnicodeData(
        to_lower=to_lower, to_upper=to_upper, to_title=to_title,
        compat_decomp=compat_decomp, canon_decomp=canon_decomp,
        general_categories=grouped_categories, combines=combines,
    )
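
# For reference, UnicodeData.txt rows are 15 semicolon-separated fields, and
# ranged assignments arrive as `, First>`/`, Last>` row pairs, which is what
# the range_start bookkeeping above expands into individual codepoints
# (sample rows, values illustrative):
#
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
#   4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
#   9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;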


def load_special_casing(file_path, unicode_data):
    # type: (str, UnicodeData) -> None
    """
    Load special casing data and enrich given Unicode data.
    """
    for line in fileinput.input(file_path):
        data = line.split("#")[0].split(";")
        if len(data) == 5:
            code, lower, title, upper, _comment = data
        elif len(data) == 6:
            code, lower, title, upper, condition, _comment = data
            if condition.strip():  # Only keep unconditional mappings
                continue
        else:
            continue
        code = code.strip()
        lower = lower.strip()
        title = title.strip()
        upper = upper.strip()
        key = int(code, 16)
        for (map_, values) in ((unicode_data.to_lower, lower),
                               (unicode_data.to_upper, upper),
                               (unicode_data.to_title, title)):
            if values != code:
                split = values.split()

                codepoints = list(itertools.chain(
                    (int(i, 16) for i in split),
                    (0 for _ in range(len(split), 3))
                ))

                assert len(codepoints) == 3
                map_[key] = codepoints


def group_categories(mapping):
    # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]]
    """
    Group codepoints mapped in "categories".
    """
    return {category: group_codepoints(codepoints)
            for category, codepoints in mapping.items()}


def group_codepoints(codepoints):
    # type: (Iterable[int]) -> List[Tuple[int, int]]
    """
    Group integral values into continuous, disjoint value ranges.

    Performs value deduplication.

    :return: sorted list of pairs denoting start and end of codepoint
        group values, both ends inclusive.

    >>> group_codepoints([1, 2, 10, 11, 12, 3, 4])
    [(1, 4), (10, 12)]
    >>> group_codepoints([1])
    [(1, 1)]
    >>> group_codepoints([1, 5, 6])
    [(1, 1), (5, 6)]
    >>> group_codepoints([])
    []
    """
    sorted_codes = sorted(set(codepoints))
    result = []  # type: List[Tuple[int, int]]

    if not sorted_codes:
        return result

    next_codes = sorted_codes[1:]
    start_code = sorted_codes[0]

    for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None):
        if next_code is None or next_code - code != 1:
            result.append((start_code, code))
            start_code = next_code

    return result
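
# Note on the loop above: zip_longest pairs each code with its successor, and
# the trailing None fillvalue marks the last element, so the final open range
# is always flushed exactly once.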


def ungroup_codepoints(codepoint_pairs):
    # type: (Iterable[Tuple[int, int]]) -> List[int]
    """
    The inverse of group_codepoints -- produce a flat list of values
    from value range pairs.

    >>> ungroup_codepoints([(1, 4), (10, 12)])
    [1, 2, 3, 4, 10, 11, 12]
    >>> ungroup_codepoints([(1, 1), (5, 6)])
    [1, 5, 6]
    >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8]))
    [1, 2, 7, 8]
    >>> ungroup_codepoints([])
    []
    """
    return list(itertools.chain.from_iterable(
        range(lo, hi + 1) for lo, hi in codepoint_pairs
    ))


def get_unassigned_codepoints(assigned_codepoints):
    # type: (Set[int]) -> Set[int]
    """
    Given a set of "assigned" codepoints, return a set
    of these that are not in assigned and not surrogate.
    """
    return {i for i in range(0, 0x110000)
            if i not in assigned_codepoints and not is_surrogate(i)}


def generate_table_lines(items, indent, wrap=98):
    # type: (Iterable[str], int, int) -> Iterator[str]
    """
    Given table items, generate wrapped lines of text with comma-separated items.

    This is a generator function.

    :param wrap: soft wrap limit (characters per line), integer.
    """
    line = " " * indent
    first = True
    for item in items:
        if len(line) + len(item) < wrap:
            if first:
                line += item
            else:
                line += ", " + item
            first = False
        else:
            yield line + ",\n"
            line = " " * indent + item

    yield line


def load_properties(file_path, interesting_props):
    # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]]
    """
    Load properties data and return in grouped form.
    """
    props = defaultdict(list)  # type: Dict[str, List[Tuple[int, int]]]
    # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
    # (for the sake of compat with future Python versions).
    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(file_path):
        match = re1.match(line) or re2.match(line)
        if match:
            groups = match.groups()

            if len(groups) == 2:
                # `re1` matched (2 groups).
                d_lo, prop = groups
                d_hi = d_lo
            else:
                d_lo, d_hi, prop = groups
        else:
            continue

        if interesting_props and prop not in interesting_props:
            continue

        lo_value = int(d_lo, 16)
        hi_value = int(d_hi, 16)

        props[prop].append((lo_value, hi_value))

    # Optimize if possible.
    for prop in props:
        props[prop] = group_codepoints(ungroup_codepoints(props[prop]))

    return props
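
# The two regexes above match the two row shapes found in PropList.txt and
# DerivedCoreProperties.txt -- a single codepoint or an inclusive range
# (sample rows, illustrative):
#
#   0020          ; White_Space # Zs       SPACE
#   0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>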


def escape_char(c):
    # type: (int) -> str
    r"""
    Escape a codepoint for use as Rust char literal.

    Outputs are OK to use as Rust source code as char literals
    and they also include necessary quotes.

    >>> escape_char(97)
    "'\\u{61}'"
    >>> escape_char(0)
    "'\\0'"
    """
    return r"'\u{%x}'" % c if c != 0 else r"'\0'"


def format_char_pair(pair):
    # type: (Tuple[int, int]) -> str
    """
    Format a pair of two Rust chars.
    """
    return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))


def generate_table(
    name,  # type: str
    items,  # type: List[Tuple[int, int]]
    decl_type="&[(char, char)]",  # type: str
    is_pub=True,  # type: bool
    format_item=format_char_pair,  # type: Callable[[Tuple[int, int]], str]
):
    # type: (...) -> Iterator[str]
    """
    Generate a nicely formatted Rust constant "table" array.

    This generates actual Rust code.
    """
    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield "    %sconst %s: %s = &[\n" % (pub_string, name, decl_type)

    data = []
    first = True
    for item in items:
        if not first:
            data.append(",")
        first = False
        data.extend(format_item(item))

    for table_line in generate_table_lines("".join(data).split(","), 8):
        yield table_line

    yield "\n    ];\n"


def compute_trie(raw_data, chunk_size):
    # type: (List[int], int) -> Tuple[List[int], List[int]]
    """
    Compute postfix-compressed trie.

    See: bool_trie.rs for more details.

    >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3)
    ([0, 0, 1], [1, 2, 3, 4, 5, 6])
    >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3)
    ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6])
    """
    root = []
    childmap = {}  # type: Dict[Tuple[int, ...], int]
    child_data = []

    assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized"

    for i in range(len(raw_data) // chunk_size):
        data = raw_data[i * chunk_size : (i + 1) * chunk_size]

        # Postfix compression of child nodes (data chunks)
        # (identical child nodes are shared).

        # Make a tuple out of the list so it's hashable.
        child = tuple(data)
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)

        root.append(childmap[child])

    return root, child_data
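
# compute_trie is what builds the BoolTrie levels below: for each fixed-size
# chunk, `root` records an index and `child_data` stores the chunk's contents
# only the first time that exact chunk is seen, so identical leaves are
# shared (the "postfix compression" described in bool_trie.rs).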


def generate_bool_trie(name, codepoint_ranges, is_pub=False):
    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
    """
    Generate Rust code for BoolTrie struct.

    This yields string fragments that should be joined to produce
    the final string.

    See: `bool_trie.rs`.
    """
    chunk_size = 64
    rawdata = [False] * 0x110000
    for (lo, hi) in codepoint_ranges:
        for cp in range(lo, hi + 1):
            rawdata[cp] = True

    # Convert to bitmap chunks of `chunk_size` bits each.
    chunks = []
    for i in range(0x110000 // chunk_size):
        chunk = 0
        for j in range(chunk_size):
            if rawdata[i * chunk_size + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield "    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)
    yield "        r1: [\n"
    data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size])
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    # 0x800..0x10000 trie
    (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
    yield "        r2: [\n"
    data = map(str, r2)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r3: &[\n"
    data = ("0x%016x" % node for node in r3)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    # 0x10000..0x110000 trie
    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size],
                             64 // chunk_size)
    (r4, r5) = compute_trie(mid, 64)

    yield "        r4: [\n"
    data = map(str, r4)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r5: &[\n"
    data = map(str, r5)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r6: &[\n"
    data = ("0x%016x" % node for node in r6)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "    };\n"


def generate_small_bool_trie(name, codepoint_ranges, is_pub=False):
    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
    """
    Generate Rust code for `SmallBoolTrie` struct.

    See: `bool_trie.rs`.
    """
    last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
    n_chunks = last_chunk + 1
    chunks = [0] * n_chunks
    for (lo, hi) in codepoint_ranges:
        for cp in range(lo, hi + 1):
            assert cp // 64 < len(chunks)
            chunks[cp // 64] |= 1 << (cp & 63)

    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield ("    %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
           % (pub_string, name))

    (r1, r2) = compute_trie(chunks, 1)

    yield "        r1: &[\n"
    data = (str(node) for node in r1)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r2: &[\n"
    data = ("0x%016x" % node for node in r2)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "    };\n"


def generate_property_module(mod, grouped_categories, category_subset):
    # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str]
    """
    Generate Rust code for module defining properties.
    """

    yield "pub(crate) mod %s {" % mod
    for cat in sorted(category_subset):
        if cat in ("Cc", "White_Space"):
            generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
        else:
            generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])

        for fragment in generator:
            yield fragment

        yield "\n"
        yield "    pub fn %s(c: char) -> bool {\n" % cat
        yield "        %s_table.lookup(c)\n" % cat
        yield "    }\n"

    yield "}\n\n"


def generate_conversions_module(unicode_data):
    # type: (UnicodeData) -> Iterator[str]
    """
    Generate Rust code for module defining conversions.
    """

    yield "pub(crate) mod conversions {"
    yield """
    pub fn to_lower(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_lowercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_lowercase_table[index].1,
        }
    }

    pub fn to_upper(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_uppercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_uppercase_table[index].1,
        }
    }

    fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
        table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
    }\n"""

    decl_type = "&[(char, [char; 3])]"
    format_conversion = lambda x: "({},[{},{},{}])".format(*(
        escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2])
    ))

    for fragment in generate_table(
        name="to_lowercase_table",
        items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]),
        decl_type=decl_type,
        is_pub=False,
        format_item=format_conversion
    ):
        yield fragment

    for fragment in generate_table(
        name="to_uppercase_table",
        items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]),
        decl_type=decl_type,
        is_pub=False,
        format_item=format_conversion
    ):
        yield fragment

    yield "}\n"


def parse_args():
    # type: () -> argparse.Namespace
    """
    Parse command line arguments.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--version", default=None, type=str,
                        help="Unicode version to use (if not specified,"
                             " defaults to latest release).")

    return parser.parse_args()


def main():
    # type: () -> None
    """
    Script entry point.
    """
    args = parse_args()

    unicode_version = fetch_files(args.version)
    print("Using Unicode version: {}".format(unicode_version.as_str))

    # All the writing happens entirely in memory, we only write to file
    # once we have generated the file content (it's not very large, <1 MB).
    buf = StringIO()
    buf.write(PREAMBLE)

    unicode_version_notice = textwrap.dedent("""
    /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
    /// `char` and `str` methods are based on.
    #[unstable(feature = "unicode_version", issue = "49726")]
    pub const UNICODE_VERSION: UnicodeVersion =
        UnicodeVersion {{ major: {v.major}, minor: {v.minor}, micro: {v.micro}, _priv: () }};
    """).format(v=unicode_version)
    buf.write(unicode_version_notice)

    get_path = lambda f: get_unicode_file_path(unicode_version, f)

    unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
    load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)

    want_derived = {"Alphabetic", "Lowercase", "Uppercase",
                    "Cased", "Case_Ignorable", "Grapheme_Extend"}
    derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)

    props = load_properties(get_path(UnicodeFiles.PROPS),
                            {"White_Space", "Join_Control", "Noncharacter_Code_Point"})

    # Category tables
    for (name, categories, category_subset) in (
        ("general_category", unicode_data.general_categories, ["N", "Cc"]),
        ("derived_property", derived, want_derived),
        ("property", props, ["White_Space"])
    ):
        for fragment in generate_property_module(name, categories, category_subset):
            buf.write(fragment)

    for fragment in generate_conversions_module(unicode_data):
        buf.write(fragment)

    tables_rs_path = os.path.join(THIS_DIR, "tables.rs")

    # Actually write out the file content.
    # Will overwrite the file if it exists.
    with open(tables_rs_path, "w") as fd:
        fd.write(buf.getvalue())

    print("Regenerated tables.rs.")


if __name__ == "__main__":
    main()
src/libcore/unicode/unicode_data.rs (new file, 2343 lines)
File diff suppressed because it is too large