mirror of https://github.com/rust-lang/rust.git
Clean up unicode.py script
This commit is contained in:
parent be1dbaffed
commit 89feb6d5fd

.gitignore (vendored)
@@ -36,6 +36,7 @@ __pycache__/
 /src/libcore/unicode/Scripts.txt
 /src/libcore/unicode/SpecialCasing.txt
 /src/libcore/unicode/UnicodeData.txt
+/src/libcore/unicode/downloaded
 /stage[0-9]+/
 /target
 target/
src/libcore/unicode/unicode.py

@@ -1,35 +1,71 @@
 #!/usr/bin/env python

-# This script uses the following Unicode tables:
-# - DerivedCoreProperties.txt
-# - DerivedNormalizationProps.txt
-# - EastAsianWidth.txt
-# - auxiliary/GraphemeBreakProperty.txt
-# - PropList.txt
-# - ReadMe.txt
-# - Scripts.txt
-# - UnicodeData.txt
-#
+"""
+Regenerate Unicode tables (tables.rs).
+"""
+
+# This script uses the Unicode tables as defined
+# in the UnicodeFiles class.
+
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs file into git.

-import fileinput, re, os, sys, operator, math, datetime
+# Note that the "curl" program is required for operation.

-# The directory in which this file resides.
-fdir = os.path.dirname(os.path.realpath(__file__)) + "/"
+# This script is compatible with Python 2.7 and 3.x.

-preamble = '''
+import argparse
+import datetime
+import fileinput
+import operator
+import os
+import re
+import textwrap
+import subprocess
+
+from collections import namedtuple
+
+
+# we don't use enum.Enum because of Python 2.7 compatibility
+class UnicodeFiles(object):
+    # ReadMe does not contain any unicode data, we
+    # use it to extract versions.
+    README = "ReadMe.txt"
+
+    DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
+    DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
+    SPECIAL_CASING = "SpecialCasing.txt"
+    SCRIPTS = "Scripts.txt"
+    PROPS = "PropList.txt"
+    UNICODE_DATA = "UnicodeData.txt"
+
+
+UnicodeFiles.ALL_FILES = tuple(
+    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+    if not name.startswith("_")
+)
+
+# The directory this file is located in.
+THIS_DIR = os.path.dirname(os.path.realpath(__file__))
+
+# Where to download the Unicode data. The downloaded files
+# will be placed in sub-directories named after Unicode version.
+FETCH_DIR = os.path.join(THIS_DIR, "downloaded")
+
+FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}"
+FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}"
+
+PREAMBLE = """\
 // NOTE: The following code was generated by "./unicode.py", do not edit directly

 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]

 use unicode::version::UnicodeVersion;
 use unicode::bool_trie::{{BoolTrie, SmallBoolTrie}};
-'''.format(year = datetime.datetime.now().year)
+""".format(year=datetime.datetime.now().year)

 # Mapping taken from Table 12 from:
 # http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
+EXPANDED_CATEGORIES = {
     'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
     'Lm': ['L'], 'Lo': ['L'],
     'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
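The ALL_FILES tuple above is built by reflecting over the class attributes rather than listing the file names twice. A minimal standalone sketch of the same dir()/getattr pattern (the toy class name Files is illustrative):

    class Files(object):
        README = "ReadMe.txt"
        SCRIPTS = "Scripts.txt"

    # dir() returns attribute names sorted alphabetically; dunders are skipped.
    ALL = tuple(
        getattr(Files, name) for name in dir(Files)
        if not name.startswith("_")
    )
    print(ALL)  # ('ReadMe.txt', 'Scripts.txt')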
@@ -42,22 +78,101 @@ expanded_categories = {
 }

 # these are the surrogate codepoints, which are not valid rust characters
-surrogate_codepoints = (0xd800, 0xdfff)
+SURROGATE_CODEPOINTS = (0xd800, 0xdfff)

-def fetch(f):
-    path = fdir + os.path.basename(f)
-    if not os.path.exists(path):
-        os.system("curl -o {0}{1} ftp://ftp.unicode.org/Public/UNIDATA/{1}".format(fdir, f))
-
-    if not os.path.exists(path):
-        sys.stderr.write("cannot load %s" % f)
-        exit(1)
+UnicodeData = namedtuple(
+    "UnicodeData", ("canon_decomp", "compat_decomp", "gencats", "combines",
+                    "to_upper", "to_lower", "to_title", )
+)
+
+UnicodeVersion = namedtuple(
+    "UnicodeVersion", ("major", "minor", "micro", "as_str")
+)
+
+
+def fetch_files(version=None):
+    """
+    Fetch all the Unicode files from unicode.org
+
+    :param version: The desired Unicode version, as string.
+        (If None, defaults to latest final release available).
+    :return: The version downloaded (UnicodeVersion object).
+    """
+    have_version = should_skip_fetch(version)
+    if have_version:
+        return have_version
+
+    if version:
+        # check if the desired version exists on the server
+        get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
+    else:
+        # extract the latest version
+        get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
+
+    readme_url = get_fetch_url(UnicodeFiles.README)
+
+    print("Fetching: {}".format(readme_url))
+    readme_content = subprocess.check_output(("curl", readme_url))
+
+    unicode_version = parse_unicode_version(
+        str(readme_content, "utf8")
+    )
+
+    download_dir = os.path.join(FETCH_DIR, unicode_version.as_str)
+    if not os.path.exists(download_dir):
+        # for 2.7 compat, we don't use exist_ok=True
+        os.makedirs(download_dir)
+
+    for filename in UnicodeFiles.ALL_FILES:
+        file_path = os.path.join(download_dir, filename)
+
+        if filename == UnicodeFiles.README:
+            with open(file_path, "wb") as fd:
+                fd.write(readme_content)
+        elif not os.path.exists(file_path):
+            url = get_fetch_url(filename)
+            print("Fetching: {}".format(url))
+            subprocess.check_call(("curl", "-o", file_path, url))
+
+    return unicode_version
+
+
+def should_skip_fetch(version):
+    if not version:
+        # should always check latest version
+        return False
+
+    fetch_dir = os.path.join(FETCH_DIR, version)
+
+    for filename in UnicodeFiles.ALL_FILES:
+        file_path = os.path.join(fetch_dir, filename)
+
+        if not os.path.exists(file_path):
+            return False
+
+    with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
+        return parse_unicode_version(fd.read())
+
+
+def parse_unicode_version(readme_content):
+    # "raw string" is necessary for \d not being treated as escape char
+    # (for the sake of compat with future Python versions)
+    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
+    groups = re.search(pattern, readme_content).groups()
+
+    return UnicodeVersion(*map(int, groups), as_str=".".join(groups))
+
+
+def get_unicode_file_path(unicode_version, filename):
+    return os.path.join(FETCH_DIR, unicode_version.as_str, filename)
+

 def is_surrogate(n):
-    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
+    return SURROGATE_CODEPOINTS[0] <= n <= SURROGATE_CODEPOINTS[1]

-def load_unicode_data(f):
-    fetch(f)
+
+def load_unicode_data(file_path):
     gencats = {}
     to_lower = {}
     to_upper = {}
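The new version detection boils down to one regex over ReadMe.txt. A self-contained sketch of the parse_unicode_version step (the sample readme line is illustrative; the real file contains a sentence of this shape):

    import re
    from collections import namedtuple

    UnicodeVersion = namedtuple("UnicodeVersion", ("major", "minor", "micro", "as_str"))

    readme = "This directory contains final data files for Version 12.1.0 of the Unicode Standard."
    groups = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode", readme).groups()
    version = UnicodeVersion(*map(int, groups), as_str=".".join(groups))
    print(version)  # UnicodeVersion(major=12, minor=1, micro=0, as_str='12.1.0')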
@@ -68,8 +183,8 @@ def load_unicode_data(f):

     udict = {}
     range_start = -1
-    for line in fileinput.input(fdir + f):
-        data = line.split(';')
+    for line in fileinput.input(file_path):
+        data = line.split(";")
         if len(data) != 15:
             continue
         cp = int(data[0], 16)
@@ -104,7 +219,7 @@ def load_unicode_data(f):

         # store decomposition, if given
         if decomp != "":
-            if decomp.startswith('<'):
+            if decomp.startswith("<"):
                 seq = []
                 for i in decomp.split()[1:]:
                     seq.append(int(i, 16))
@@ -116,7 +231,7 @@ def load_unicode_data(f):
             canon_decomp[code] = seq

         # place letter in categories as appropriate
-        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
+        for cat in [gencat, "Assigned"] + EXPANDED_CATEGORIES.get(gencat, []):
             if cat not in gencats:
                 gencats[cat] = []
             gencats[cat].append(code)
@@ -136,12 +251,15 @@ def load_unicode_data(f):
     gencats = group_cats(gencats)
     combines = to_combines(group_cats(combines))

-    return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
+    return UnicodeData(
+        canon_decomp, compat_decomp, gencats, combines, to_upper,
+        to_lower, to_title,
+    )

-def load_special_casing(f, to_upper, to_lower, to_title):
-    fetch(f)
-    for line in fileinput.input(fdir + f):
-        data = line.split('#')[0].split(';')
+
+def load_special_casing(file_path, unicode_data):
+    for line in fileinput.input(file_path):
+        data = line.split("#")[0].split(";")
         if len(data) == 5:
             code, lower, title, upper, _comment = data
         elif len(data) == 6:
@@ -155,7 +273,9 @@ def load_special_casing(f, to_upper, to_lower, to_title):
         title = title.strip()
         upper = upper.strip()
         key = int(code, 16)
-        for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
+        for (map_, values) in ((unicode_data.to_lower, lower),
+                               (unicode_data.to_upper, upper),
+                               (unicode_data.to_title, title)):
             if values != code:
                 values = [int(i, 16) for i in values.split()]
                 for _ in range(len(values), 3):
@@ -163,12 +283,14 @@ def load_special_casing(f, to_upper, to_lower, to_title):
                 assert len(values) == 3
                 map_[key] = values

+
 def group_cats(cats):
     cats_out = {}
     for cat in cats:
         cats_out[cat] = group_cat(cats[cat])
     return cats_out

+
 def group_cat(cat):
     cat_out = []
     letters = sorted(set(cat))
@@ -185,6 +307,7 @@ def group_cat(cat):
     cat_out.append((cur_start, cur_end))
     return cat_out

+
 def ungroup_cat(cat):
     cat_out = []
     for (lo, hi) in cat:
@@ -193,21 +316,24 @@ def ungroup_cat(cat):
             lo += 1
     return cat_out

+
 def gen_unassigned(assigned):
     assigned = set(assigned)
     return ([i for i in range(0, 0xd800) if i not in assigned] +
             [i for i in range(0xe000, 0x110000) if i not in assigned])

+
 def to_combines(combs):
     combs_out = []
     for comb in combs:
         for (lo, hi) in combs[comb]:
             combs_out.append((lo, hi, comb))
-    combs_out.sort(key=lambda comb: comb[0])
+    combs_out.sort(key=lambda c: c[0])
     return combs_out

+
 def format_table_content(f, content, indent):
-    line = " "*indent
+    line = " " * indent
     first = True
     for chunk in content.split(","):
         if len(line) + len(chunk) < 98:
@@ -218,16 +344,19 @@ def format_table_content(f, content, indent):
             first = False
         else:
             f.write(line + ",\n")
-            line = " "*indent + chunk
+            line = " " * indent + chunk
     f.write(line)

-def load_properties(f, interestingprops):
-    fetch(f)
-    props = {}
-    re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
-    re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
-
-    for line in fileinput.input(fdir + os.path.basename(f)):
+
+def load_properties(file_path, interestingprops):
+    props = {}
+    # "raw string" is necessary for \w not to be treated as escape char
+    # (for the sake of compat with future Python versions)
+    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
+    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
+
+    for line in fileinput.input(file_path):
         prop = None
         d_lo = 0
         d_hi = 0
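For reference, format_table_content packs comma-separated items into lines of under 98 columns. A runnable sketch, with the branch bodies that fall outside the hunks above reconstructed for illustration:

    import sys

    def format_table_content(f, content, indent):
        line = " " * indent
        first = True
        for chunk in content.split(","):
            if len(line) + len(chunk) < 98:
                # append to the current line (reconstructed branch)
                line += chunk if first else ", " + chunk
                first = False
            else:
                # flush the full line and start a fresh indented one
                f.write(line + ",\n")
                line = " " * indent + chunk
        f.write(line)

    format_table_content(sys.stdout, ",".join(str(i) for i in range(40)), 8)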
@@ -258,10 +387,12 @@ def load_properties(f, interestingprops):

     return props

+
 def escape_char(c):
     return "'\\u{%x}'" % c if c != 0 else "'\\0'"

-def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
+
+def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True,
         pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
     pub_string = ""
     if is_pub:
@@ -277,6 +408,7 @@ def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
     format_table_content(f, data, 8)
     f.write("\n    ];\n\n")

+
 def compute_trie(rawdata, chunksize):
     root = []
     childmap = {}
@@ -288,10 +420,11 @@ def compute_trie(rawdata, chunksize):
             childmap[child] = len(childmap)
             child_data.extend(data)
         root.append(childmap[child])
-    return (root, child_data)
+    return root, child_data

+
 def emit_bool_trie(f, name, t_data, is_pub=True):
-    CHUNK = 64
+    chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in t_data:
         for cp in range(lo, hi + 1):
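compute_trie deduplicates fixed-size chunks: root holds one index per chunk position and child_data stores each distinct chunk once. A sketch with the lines elided between the hunks reconstructed for illustration:

    def compute_trie(rawdata, chunksize):
        root = []
        childmap = {}
        child_data = []
        for i in range(len(rawdata) // chunksize):
            # reconstructed: slice out the next chunk and use it as a dedup key
            data = rawdata[i * chunksize: (i + 1) * chunksize]
            child = tuple(data)
            if child not in childmap:
                childmap[child] = len(childmap)
                child_data.extend(data)
            root.append(childmap[child])
        return root, child_data

    print(compute_trie([1, 1, 2, 2, 1, 1], 2))  # ([0, 1, 0], [1, 1, 2, 2])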
@@ -299,7 +432,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):

     # convert to bitmap chunks of 64 bits each
     chunks = []
-    for i in range(0x110000 // CHUNK):
+    for i in range(0x110000 // chunk_size):
         chunk = 0
         for j in range(64):
             if rawdata[i * 64 + j]:
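This loop converts the 0x110000-entry boolean bitmap into 64-bit integers. A self-contained sketch of the packing step (the loop body past the hunk boundary is assumed here to set bit j):

    def to_chunks(rawdata, chunk_size=64):
        chunks = []
        for i in range(len(rawdata) // chunk_size):
            chunk = 0
            for j in range(chunk_size):
                if rawdata[i * chunk_size + j]:
                    chunk |= 1 << j  # assumed body: set bit j for entry i*64+j
            chunks.append(chunk)
        return chunks

    flags = [False] * 128
    flags[0] = flags[65] = True
    print([hex(c) for c in to_chunks(flags)])  # ['0x1', '0x2']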
@@ -311,12 +444,12 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
         pub_string = "pub "
     f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
     f.write("        r1: [\n")
-    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
+    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size])
     format_table_content(f, data, 12)
     f.write("\n        ],\n")

     # 0x800..0x10000 trie
-    (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK)
+    (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
     f.write("        r2: [\n")
     data = ','.join(str(node) for node in r2)
     format_table_content(f, data, 12)
@@ -327,7 +460,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     f.write("\n        ],\n")

     # 0x10000..0x110000 trie
-    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK)
+    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
     data = ','.join(str(node) for node in r4)
@@ -344,6 +477,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):

     f.write("    };\n\n")

+
 def emit_small_bool_trie(f, name, t_data, is_pub=True):
     last_chunk = max(hi // 64 for (lo, hi) in t_data)
     n_chunks = last_chunk + 1
@@ -374,6 +508,7 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True):

     f.write("    };\n\n")

+
 def emit_property_module(f, mod, tbl, emit):
     f.write("pub mod %s {\n" % mod)
     for cat in sorted(emit):
@@ -389,7 +524,8 @@ def emit_property_module(f, mod, tbl, emit):
         f.write("    }\n\n")
     f.write("}\n\n")

-def emit_conversions_module(f, to_upper, to_lower, to_title):
+
+def emit_conversions_module(f, unicode_data):
     f.write("pub mod conversions {")
     f.write("""
 pub fn to_lower(c: char) -> [char; 3] {
@@ -414,74 +550,104 @@ def emit_conversions_module(f, to_upper, to_lower, to_title):
     t_type = "&[(char, [char; 3])]"
     pfun = lambda x: "(%s,[%s,%s,%s])" % (
         escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
-    emit_table(f, "to_lowercase_table",
-        sorted(to_lower.items(), key=operator.itemgetter(0)),
-        is_pub=False, t_type = t_type, pfun=pfun)
-    emit_table(f, "to_uppercase_table",
-        sorted(to_upper.items(), key=operator.itemgetter(0)),
-        is_pub=False, t_type = t_type, pfun=pfun)
-    f.write("}\n\n")
-
-def emit_norm_module(f, canon, compat, combine, norm_props):
-    canon_keys = sorted(canon.keys())
-
-    compat_keys = sorted(compat.keys())
+    emit_table(f,
+               name="to_lowercase_table",
+               t_data=sorted(unicode_data.to_lower.items(), key=operator.itemgetter(0)),
+               t_type=t_type,
+               is_pub=False,
+               pfun=pfun)
+
+    emit_table(f,
+               name="to_uppercase_table",
+               t_data=sorted(unicode_data.to_upper.items(), key=operator.itemgetter(0)),
+               t_type=t_type,
+               is_pub=False,
+               pfun=pfun)
+
+    f.write("}\n")
+
+
+def emit_norm_module(f, unicode_data, norm_props):
+    canon_keys = sorted(unicode_data.canon_decomp.keys())

     canon_comp = {}
     comp_exclusions = norm_props["Full_Composition_Exclusion"]
     for char in canon_keys:
         if any(lo <= char <= hi for lo, hi in comp_exclusions):
             continue
-        decomp = canon[char]
+        decomp = unicode_data.canon_decomp[char]
         if len(decomp) == 2:
             if decomp[0] not in canon_comp:
                 canon_comp[decomp[0]] = []
-            canon_comp[decomp[0]].append( (decomp[1], char) )
-    canon_comp_keys = sorted(canon_comp.keys())
+            canon_comp[decomp[0]].append((decomp[1], char))

-if __name__ == "__main__":
-    r = fdir + "tables.rs"
-    if os.path.exists(r):
-        os.remove(r)
-    with open(r, "w") as rf:
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-v", "--version", default=None, type=str,
+                        help="Unicode version to use (if not specified,"
+                             " defaults to latest available final release).")
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    unicode_version = fetch_files(args.version)
+    print("Using Unicode version: {}".format(unicode_version.as_str))
+
+    tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
+    if os.path.exists(tables_rs_path):
+        os.remove(tables_rs_path)
+
+    with open(tables_rs_path, "w") as rf:
         # write the file's preamble
-        rf.write(preamble)
+        rf.write(PREAMBLE)

-        # download and parse all the data
-        fetch("ReadMe.txt")
-        with open(fdir + "ReadMe.txt") as readme:
-            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-            unicode_version = re.search(pattern, readme.read()).groups()
-        rf.write("""
+        unicode_version_notice = textwrap.dedent("""
 /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
 /// `char` and `str` methods are based on.
 #[unstable(feature = "unicode_version", issue = "49726")]
-pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
-    major: %s,
-    minor: %s,
-    micro: %s,
+pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{
+    major: {version.major},
+    minor: {version.minor},
+    micro: {version.micro},
     _priv: (),
-};
-""" % unicode_version)
+}};
+""").format(version=unicode_version)
+        rf.write(unicode_version_notice)

-        (canon_decomp, compat_decomp, gencats, combines,
-                to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
-        load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
+        get_path = lambda f: get_unicode_file_path(unicode_version, f)
+
+        unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
+        load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
+
         want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
                         "Cased", "Case_Ignorable", "Grapheme_Extend"]
-        derived = load_properties("DerivedCoreProperties.txt", want_derived)
-        scripts = load_properties("Scripts.txt", [])
-        props = load_properties("PropList.txt",
-                ["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
-        norm_props = load_properties("DerivedNormalizationProps.txt",
+        derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
+
+        # TODO scripts not used?
+        scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), [])
+        props = load_properties(get_path(UnicodeFiles.PROPS),
+                                ["White_Space", "Join_Control", "Noncharacter_Code_Point",
+                                 "Pattern_White_Space"])
+        norm_props = load_properties(get_path(UnicodeFiles.DERIVED_NORMALIZATION_PROPS),
                                      ["Full_Composition_Exclusion"])

         # category tables
-        for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
-                                  ("derived_property", derived, want_derived), \
-                                  ("property", props, ["White_Space", "Pattern_White_Space"]):
+        for (name, cat, pfuns) in (("general_category", unicode_data.gencats, ["N", "Cc"]),
+                                   ("derived_property", derived, want_derived),
+                                   ("property", props, ["White_Space", "Pattern_White_Space"])):
             emit_property_module(rf, name, cat, pfuns)

         # normalizations and conversions module
-        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
-        emit_conversions_module(rf, to_upper, to_lower, to_title)
+        emit_norm_module(rf, unicode_data, norm_props)
+        emit_conversions_module(rf, unicode_data)

     print("Regenerated tables.rs.")
+
+
+if __name__ == "__main__":
+    main()
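With this change the script grows a small CLI: run with no arguments it fetches the latest final release into downloaded/<version>/ and regenerates tables.rs, while -v/--version pins a specific Unicode version. A minimal sketch of the argparse surface as defined above:

    import argparse

    parser = argparse.ArgumentParser(description="Regenerate Unicode tables (tables.rs).")
    parser.add_argument("-v", "--version", default=None, type=str,
                        help="Unicode version to use (if not specified,"
                             " defaults to latest available final release).")

    print(parser.parse_args(["-v", "12.0.0"]).version)  # 12.0.0
    print(parser.parse_args([]).version)                # None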