mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-02 15:32:06 +00:00
Add libunicode; move unicode functions from core
- created new crate, libunicode, below libstd
- split Char trait into Char (libcore) and UnicodeChar (libunicode)
  - Unicode-aware functions now live in libunicode
    - is_alphabetic, is_XID_start, is_XID_continue, is_lowercase, is_uppercase, is_whitespace, is_alphanumeric, is_control, is_digit, to_uppercase, to_lowercase
  - added width method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is a non-NULL control character
    - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise)
- split StrSlice into StrSlice (libcore) and UnicodeStrSlice (libunicode)
  - functionality formerly in StrSlice that relied upon Unicode functionality from Char is now in UnicodeStrSlice
    - words, is_whitespace, is_alphanumeric, trim, trim_left, trim_right
  - also moved Words type alias into libunicode because words method is in UnicodeStrSlice
- unified Unicode tables from libcollections, libcore, and libregex into libunicode
- updated unicode.py in src/etc to generate aforementioned tables
- generated new tables based on latest Unicode data
- added UnicodeChar and UnicodeStrSlice traits to prelude
- libunicode is now the collection point for the std::char module, combining the libunicode functionality with the Char functionality from libcore
  - thus, moved doc comment for char from core::char to unicode::char
- libcollections remains the collection point for std::str

The Unicode-aware functions that previously lived in the Char and StrSlice traits are no longer available to programs that only use libcore.
To regain use of these methods, include the libunicode crate and use the UnicodeChar and/or UnicodeStrSlice traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method

NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude.

closes #15224

[breaking-change]
This commit is contained in:
parent
4f120e6baf
commit
5d4238b6fc
@ -51,17 +51,19 @@
|
||||
|
||||
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
|
||||
uuid serialize sync getopts collections num test time rand \
|
||||
url log regex graphviz core rlibc alloc debug rustrt
|
||||
url log regex graphviz core rlibc alloc debug rustrt \
|
||||
unicode
|
||||
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros
|
||||
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
|
||||
TOOLS := compiletest rustdoc rustc
|
||||
|
||||
DEPS_core :=
|
||||
DEPS_rlibc :=
|
||||
DEPS_unicode := core
|
||||
DEPS_alloc := core libc native:jemalloc
|
||||
DEPS_debug := std
|
||||
DEPS_rustrt := alloc core libc collections native:rustrt_native
|
||||
DEPS_std := core libc rand alloc collections rustrt sync \
|
||||
DEPS_std := core libc rand alloc collections rustrt sync unicode \
|
||||
native:rust_builtin native:backtrace
|
||||
DEPS_graphviz := std
|
||||
DEPS_green := std native:context_switch
|
||||
@ -82,7 +84,7 @@ DEPS_semver := std
|
||||
DEPS_uuid := std serialize
|
||||
DEPS_sync := core alloc rustrt collections
|
||||
DEPS_getopts := std
|
||||
DEPS_collections := core alloc
|
||||
DEPS_collections := core alloc unicode
|
||||
DEPS_fourcc := rustc syntax std
|
||||
DEPS_hexfloat := rustc syntax std
|
||||
DEPS_num := std
|
||||
@ -108,6 +110,7 @@ ONLY_RLIB_rlibc := 1
|
||||
ONLY_RLIB_alloc := 1
|
||||
ONLY_RLIB_rand := 1
|
||||
ONLY_RLIB_collections := 1
|
||||
ONLY_RLIB_unicode := 1
|
||||
|
||||
################################################################################
|
||||
# You should not need to edit below this line
|
||||
|
@ -1,183 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
# file at the top-level directory of this distribution and at
|
||||
# http://rust-lang.org/COPYRIGHT.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
import csv
|
||||
import datetime
|
||||
import urllib2
|
||||
|
||||
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
|
||||
DATA = 'UnicodeData.txt'
|
||||
SCRIPTS = 'Scripts.txt'
|
||||
|
||||
# Maps each two-letter Unicode general category to the grouped categories
# that contain it (e.g. 'Lu' is part of both 'LC' and 'L').
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # N is Nd | Nl | No, so 'No' must expand to the group 'N' (not to itself).
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
|
||||
|
||||
|
||||
# Format the code point `n` as a backslash-U escape with its hexadecimal
# digits left-padded with zeros to eight characters.
def as_4byte_uni(n):
    digits = hex(n)[2:]
    return '\\U' + digits.rjust(8, '0')
|
||||
|
||||
|
||||
# Return the grouped categories that contain category `c` (looked up in
# `expanded_categories`), followed by `c` itself.
def expand_cat(c):
    parents = expanded_categories.get(c, [])
    return parents + [c]
|
||||
|
||||
|
||||
# True when `n` lies in 0..0x10FFFF and is not in the gap 0xD800..0xDFFF.
def is_valid_unicode(n):
    if n < 0 or n > 0x10FFFF:
        return False
    return not (0xD7FF < n < 0xE000)
|
||||
|
||||
|
||||
# Read ';'-separated rows from `f` and return a defaultdict mapping each
# category name (the row's third field, expanded with `expand_cat`) to the
# list of code points (the row's first field, parsed as hex) assigned to it.
# Rows whose code point fails `is_valid_unicode` are skipped.
def read_cats(f):
    by_category = defaultdict(list)
    for fields in csv.reader(f, delimiter=';'):
        codepoint = int(fields[0], 16)
        categories = expand_cat(fields[2])
        if is_valid_unicode(codepoint):
            for category in categories:
                by_category[category].append(codepoint)
    return by_category
|
||||
|
||||
|
||||
# Read lines of the form `XXXX ; Name # comment` or `XXXX..YYYY ; Name # comment`
# from `f` and return a defaultdict mapping each name to the list of code
# points it covers.  Blank lines and lines starting with '#' are ignored, as
# are code points rejected by `is_valid_unicode`.
def read_scripts(f):
    by_script = defaultdict(list)
    for raw in f:
        entry = raw.strip()
        if not entry or entry.startswith('#'):
            continue
        span, name = [field.strip() for field in entry.split(';')][:2]
        # Everything from the '#' onwards is a trailing comment.
        name = name[:name.index('#')].strip()
        if '..' in span:
            first, last = [int(part, 16) for part in span.split('..')]
        else:
            # A single code point is treated as a one-element range.
            first = last = int(span, 16)
        for codepoint in xrange(first, last + 1):
            if is_valid_unicode(codepoint):
                by_script[name].append(codepoint)
    return by_script
|
||||
|
||||
|
||||
# Collapse a collection of code points into a sorted list of inclusive
# (start, end) ranges of consecutive values.  Duplicates are dropped first.
# Raises IndexError when `letters` is empty.
def group(letters):
    ordered = sorted(set(letters))
    start = end = ordered[0]
    ranges = []
    for codepoint in ordered[1:]:
        assert codepoint > end, \
            'cur_end: %s, letter: %s' % (hex(end), hex(codepoint))
        if codepoint != end + 1:
            # Gap found: close the current range and open a new one.
            ranges.append((start, end))
            start = codepoint
        end = codepoint
    ranges.append((start, end))
    return ranges
|
||||
|
||||
|
||||
# Render (start, end) pairs as a comma-separated list of `('..', '..')`
# tuples, one per line, with each bound formatted by `as_4byte_uni`.
def ranges_to_rust(rs):
    literals = []
    for start, end in rs:
        literals.append("('%s', '%s')" % (as_4byte_uni(start), as_4byte_uni(end)))
    return ',\n '.join(literals)
|
||||
|
||||
|
||||
# Render a {name: ranges} mapping as one `("name", &[ ... ]),` entry per
# group, in sorted name order, joined by newlines.
def groups_to_rust(groups):
    entries = ['("%s", &[\n %s\n ]),' % (name, ranges_to_rust(groups[name]))
               for name in sorted(groups)]
    return '\n'.join(entries)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command line: --local reads the data files from the current directory,
    # otherwise they are downloaded from --base-url.
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    parser.add_argument(
        '--local', action='store_true',
        help='When set, Scripts.txt and UnicodeData.txt will be read from '
             'the CWD.')
    parser.add_argument(
        '--base-url', type=str, default=BASE_URL,
        help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    if args.local:
        open_table = open
    else:
        def open_table(name):
            return urllib2.urlopen(args.base_url + '/' + name)

    cats = read_cats(open_table(DATA))
    scripts = read_scripts(open_table(SCRIPTS))

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    grouped = {}
    for key, letters in combined.items():
        grouped[key] = group(letters)
    unigroups = groups_to_rust(grouped)

    # Now get Perl character classes that are Unicode friendly.
    # \d: ASCII digits plus category Nd.
    perld = list(range(ord('0'), ord('9') + 1))
    dgroups = ranges_to_rust(group(perld + cats['Nd']))

    # \s: ASCII whitespace plus category Z.
    perls = [ord(ch) for ch in ['\t', '\n', '\x0C', '\r', ' ']]
    sgroups = ranges_to_rust(group(perls + cats['Z']))

    # \w: underscore, ASCII digits and letters, plus category L.
    low = list(range(ord('a'), ord('z') + 1))
    up = list(range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L']))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
{dgroups}
];

pub static PERLS: Class = &[
{sgroups}
];

pub static PERLW: Class = &[
{wgroups}
];
'''
    generated_at = datetime.datetime.now()
    print(tpl.format(date=str(generated_at), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
|
@ -10,17 +10,46 @@
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
|
||||
# code covering the core properties. Since this is a pretty rare event we
|
||||
# just store this out-of-line and check the unicode.rs file into git.
|
||||
# This script uses the following Unicode tables:
|
||||
# - DerivedCoreProperties.txt
|
||||
# - EastAsianWidth.txt
|
||||
# - PropList.txt
|
||||
# - Scripts.txt
|
||||
# - UnicodeData.txt
|
||||
#
|
||||
# The emitted code is "the minimum we think is necessary for libstd", that
|
||||
# is, to support basic operations of the compiler and "most nontrivial rust
|
||||
# programs". It is not meant to be a complete implementation of unicode.
|
||||
# For that we recommend you use a proper binding to libicu.
|
||||
# Since this should not require frequent updates, we just store this
|
||||
# out-of-line and check the unicode.rs file into git.
|
||||
|
||||
import fileinput, re, os, sys, operator
|
||||
|
||||
# Text written at the very top of the generated Rust file: the license
# notice, a generated-code warning and the lint allowances for the tables.
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly

#![allow(missing_doc, non_uppercase_statics, non_snake_case_functions)]
'''
|
||||
|
||||
# Maps each two-letter Unicode general category to the grouped categories
# that contain it (e.g. 'Lu' is part of both 'LC' and 'L').
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # N is Nd | Nl | No, so 'No' must expand to the group 'N' (not to itself);
    # otherwise the generated N table silently omits every No code point.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
|
||||
|
||||
def fetch(f):
|
||||
if not os.path.exists(f):
|
||||
@ -31,21 +60,17 @@ def fetch(f):
|
||||
sys.stderr.write("cannot load %s" % f)
|
||||
exit(1)
|
||||
|
||||
# True when `n` lies in 0..0x10FFFF and is not in the gap 0xD800..0xDFFF.
def is_valid_unicode(n):
    if n < 0 or n > 0x10FFFF:
        return False
    return not (0xD7FF < n < 0xE000)
|
||||
|
||||
def load_unicode_data(f):
|
||||
fetch(f)
|
||||
gencats = {}
|
||||
upperlower = {}
|
||||
lowerupper = {}
|
||||
combines = []
|
||||
combines = {}
|
||||
canon_decomp = {}
|
||||
compat_decomp = {}
|
||||
curr_cat = ""
|
||||
curr_combine = ""
|
||||
c_lo = 0
|
||||
c_hi = 0
|
||||
com_lo = 0
|
||||
com_hi = 0
|
||||
|
||||
for line in fileinput.input(f):
|
||||
fields = line.split(";")
|
||||
@ -58,6 +83,9 @@ def load_unicode_data(f):
|
||||
code_org = code
|
||||
code = int(code, 16)
|
||||
|
||||
if not is_valid_unicode(code):
|
||||
continue
|
||||
|
||||
# generate char to char direct common and simple conversions
|
||||
# uppercase to lowercase
|
||||
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
|
||||
@ -67,6 +95,7 @@ def load_unicode_data(f):
|
||||
if gencat == "Ll" and upcase != "" and code_org != upcase:
|
||||
lowerupper[code] = int(upcase, 16)
|
||||
|
||||
# store decomposition, if given
|
||||
if decomp != "":
|
||||
if decomp.startswith('<'):
|
||||
seq = []
|
||||
@ -79,38 +108,76 @@ def load_unicode_data(f):
|
||||
seq.append(int(i, 16))
|
||||
canon_decomp[code] = seq
|
||||
|
||||
if curr_cat == "":
|
||||
curr_cat = gencat
|
||||
c_lo = code
|
||||
c_hi = code
|
||||
# place letter in categories as appropriate
|
||||
for cat in [gencat] + expanded_categories.get(gencat, []):
|
||||
if cat not in gencats:
|
||||
gencats[cat] = []
|
||||
gencats[cat].append(code)
|
||||
|
||||
if curr_cat == gencat:
|
||||
c_hi = code
|
||||
else:
|
||||
if curr_cat not in gencats:
|
||||
gencats[curr_cat] = []
|
||||
# record combining class, if any
|
||||
if combine != "0":
|
||||
if combine not in combines:
|
||||
combines[combine] = []
|
||||
combines[combine].append(code)
|
||||
|
||||
gencats[curr_cat].append((c_lo, c_hi))
|
||||
curr_cat = gencat
|
||||
c_lo = code
|
||||
c_hi = code
|
||||
|
||||
if curr_combine == "":
|
||||
curr_combine = combine
|
||||
com_lo = code
|
||||
com_hi = code
|
||||
|
||||
if curr_combine == combine:
|
||||
com_hi = code
|
||||
else:
|
||||
if curr_combine != "0":
|
||||
combines.append((com_lo, com_hi, curr_combine))
|
||||
curr_combine = combine
|
||||
com_lo = code
|
||||
com_hi = code
|
||||
gencats = group_cats(gencats)
|
||||
combines = to_combines(group_cats(combines))
|
||||
|
||||
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
|
||||
|
||||
# Apply `group_cat` to every value of `cats`, returning a new dict with the
# same keys.
def group_cats(cats):
    return dict((name, group_cat(cats[name])) for name in cats)
|
||||
|
||||
# Collapse a collection of code points into a sorted list of inclusive
# (start, end) ranges of consecutive values.  Duplicates are dropped first.
# Raises IndexError when `cat` is empty.
def group_cat(cat):
    ordered = sorted(set(cat))
    start = end = ordered[0]
    ranges = []
    for codepoint in ordered[1:]:
        assert codepoint > end, \
            "cur_end: %s, letter: %s" % (hex(end), hex(codepoint))
        if codepoint != end + 1:
            # Gap found: close the current range and open a new one.
            ranges.append((start, end))
            start = codepoint
        end = codepoint
    ranges.append((start, end))
    return ranges
|
||||
|
||||
# Expand a list of inclusive (lo, hi) ranges back into the flat list of
# values they cover, in the order the ranges are given.
def ungroup_cat(cat):
    cat_out = []
    for (lo, hi) in cat:
        cat_out.extend(range(lo, hi + 1))
    return cat_out
|
||||
|
||||
# Flatten a {class: [(lo, hi), ...]} mapping into a list of
# (lo, hi, class) triples ordered by `lo`.
def to_combines(combs):
    triples = [(lo, hi, comb) for comb in combs for (lo, hi) in combs[comb]]
    return sorted(triples, key=lambda triple: triple[0])
|
||||
|
||||
# Write the comma-separated `content` to `f`, indented by `indent` spaces and
# wrapped onto a new line whenever appending the next chunk would bring the
# current line (before its separator) to 98 characters or more.  No trailing
# newline is written after the last line.
def format_table_content(f, content, indent):
    margin = " " * indent
    current = margin
    at_start = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # Line is full: flush it and start a fresh one with this chunk.
            f.write(current + ",\n")
            current = margin + piece
            continue
        if not at_start:
            current += ", "
        current += piece
        at_start = False
    f.write(current)
|
||||
|
||||
def load_properties(f, interestingprops):
|
||||
fetch(f)
|
||||
props = {}
|
||||
@ -134,7 +201,7 @@ def load_properties(f, interestingprops):
|
||||
prop = m.group(3)
|
||||
else:
|
||||
continue
|
||||
if prop not in interestingprops:
|
||||
if interestingprops and prop not in interestingprops:
|
||||
continue
|
||||
d_lo = int(d_lo, 16)
|
||||
d_hi = int(d_hi, 16)
|
||||
@ -143,6 +210,43 @@ def load_properties(f, interestingprops):
|
||||
props[prop].append((d_lo, d_hi))
|
||||
return props
|
||||
|
||||
# load all widths of want_widths, except those in except_cats
#
# Parses EastAsianWidth.txt and returns {width: [(lo, hi), ...]}.  Each data
# line is either `XXXX;W # Cat ...` (single code point) or
# `XXXX..YYYY;W # Cat ...` (range); lines matching neither form are ignored.
def load_east_asian_width(want_widths, except_cats):
    path = "EastAsianWidth.txt"
    fetch(path)
    widths = {}
    re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
    re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")

    for line in fileinput.input(path):
        single = re1.match(line)
        if single:
            d_lo, width, cat = single.groups()
            d_hi = d_lo
        else:
            ranged = re2.match(line)
            if not ranged:
                continue
            d_lo, d_hi, width, cat = ranged.groups()
        if cat in except_cats or width not in want_widths:
            continue
        widths.setdefault(width, []).append((int(d_lo, 16), int(d_hi, 16)))
    return widths
|
||||
|
||||
def escape_char(c):
|
||||
if c <= 0xff:
|
||||
return "'\\x%2.2x'" % c
|
||||
@ -150,59 +254,72 @@ def escape_char(c):
|
||||
return "'\\u%4.4x'" % c
|
||||
return "'\\U%8.8x'" % c
|
||||
|
||||
# Separator to write before table entry number `ix`: a plain lead-in for the
# first entry, a comma plus line break before every other even-numbered
# entry, and a comma plus space before odd-numbered ones.
def ch_prefix(ix):
    if ix == 0:
        return " "
    return ", " if ix % 2 else ",\n "
|
||||
|
||||
def emit_bsearch_range_table(f):
|
||||
f.write("""
|
||||
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use slice::ImmutableVector;
|
||||
use option::None;
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
use core::option::None;
|
||||
r.bsearch(|&(lo,hi)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}) != None
|
||||
}\n
|
||||
""");
|
||||
""")
|
||||
|
||||
def emit_property_module(f, mod, tbl):
|
||||
# Write a Rust static slice named `name` of type `t_type` to `f`.
#   t_data - iterable of entries
#   is_pub - prefix the declaration with `pub ` when true
#   pfun   - formats one entry as Rust source; the default renders a
#            two-element entry as a pair of escaped chars
# The entries are joined with commas and laid out by `format_table_content`.
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    visibility = "pub " if is_pub else ""
    f.write(" %sstatic %s: %s = &[\n" % (visibility, name, t_type))
    entries = ",".join([pfun(dat) for dat in t_data])
    format_table_content(f, entries, 8)
    f.write("\n ];\n\n")
|
||||
|
||||
def emit_property_module(f, mod, tbl, emit_fn):
|
||||
f.write("pub mod %s {\n" % mod)
|
||||
keys = tbl.keys()
|
||||
keys.sort()
|
||||
|
||||
for cat in keys:
|
||||
if cat not in ["Nd", "Nl", "No", "Cc",
|
||||
"XID_Start", "XID_Continue", "Alphabetic",
|
||||
"Lowercase", "Uppercase", "White_Space"]:
|
||||
continue
|
||||
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
|
||||
ix = 0
|
||||
for pair in tbl[cat]:
|
||||
f.write(ch_prefix(ix))
|
||||
f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
|
||||
ix += 1
|
||||
f.write("\n ];\n\n")
|
||||
|
||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
|
||||
f.write(" }\n\n")
|
||||
emit_table(f, "%s_table" % cat, tbl[cat])
|
||||
if cat in emit_fn:
|
||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
|
||||
f.write(" }\n\n")
|
||||
f.write("}\n\n")
|
||||
|
||||
# Write the `regex` module to `f`:
#   UNICODE_CLASSES - one (name, table) entry per element of `cats`; each
#                     element is indexed as (table name, module name)
#   PERLD / PERLS   - aliases of the Nd and White_Space tables
#   PERLW           - a table built from `w_data`
def emit_regex_module(f, cats, w_data):
    regex_class = "&'static [(char, char)]"
    class_table = "&'static [(&'static str, %s)]" % regex_class

    def class_entry(x):
        return "(\"%s\",super::%s::%s_table)" % (x[0], x[1], x[0])

    f.write("pub mod regex {\n")
    emit_table(f, "UNICODE_CLASSES", cats, class_table, pfun=class_entry)

    perld_decl = " pub static PERLD: %s = super::general_category::Nd_table;\n\n"
    f.write(perld_decl % regex_class)
    perls_decl = " pub static PERLS: %s = super::property::White_Space_table;\n\n"
    f.write(perls_decl % regex_class)

    emit_table(f, "PERLW", w_data, regex_class)
    f.write("}\n\n")
|
||||
|
||||
def emit_conversions_module(f, lowerupper, upperlower):
|
||||
f.write("pub mod conversions {")
|
||||
f.write("""
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use slice::ImmutableVector;
|
||||
use tuple::Tuple2;
|
||||
use option::{Option, Some, None};
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
use core::tuple::Tuple2;
|
||||
use core::option::{Option, Some, None};
|
||||
|
||||
pub fn to_lower(c: char) -> char {
|
||||
match bsearch_case_table(c, LuLl_table) {
|
||||
@ -226,189 +343,88 @@ def emit_conversions_module(f, lowerupper, upperlower):
|
||||
})
|
||||
}
|
||||
|
||||
""");
|
||||
emit_caseconversion_table(f, "LuLl", upperlower)
|
||||
emit_caseconversion_table(f, "LlLu", lowerupper)
|
||||
""")
|
||||
emit_table(f, "LuLl_table",
|
||||
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
||||
emit_table(f, "LlLu_table",
|
||||
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
||||
f.write("}\n\n")
|
||||
|
||||
# Write the `charwidth` module to `f`.  The emitted Rust contains:
#   - bsearch_range_value_table: binary search over (lo, hi, width, cjk_width)
#     entries, returning 1 when the char is in no range
#   - width: Some(0) for NUL, None for chars below 0x20 and in 0x7F..0x9F,
#     Some(1) for the rest of ASCII, otherwise the table lookup
#   - charwidth_table: built from `width_table`, whose entries are indexed
#     as (lo, hi, width, cjk_width)
def emit_charwidth_module(f, width_table):
    def width_entry(x):
        return "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])

    f.write("pub mod charwidth {\n")
    f.write(" use core::option::{Option, Some, None};\n")
    f.write(" use core::slice::ImmutableVector;\n")

    search_fn = """
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
None => 1
}
}
"""
    f.write(search_fn)

    width_fn = """
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as uint)
}
}

"""
    f.write(width_fn)

    f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
    f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]",
               is_pub=False, pfun=width_entry)
    f.write("}\n")
|
||||
|
||||
# Write a private `<name>_table` static of (char, char) pairs to `f`, one
# pair per item of the dict `table`, ordered by key.
def emit_caseconversion_table(f, name, table):
    f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
    pairs = sorted(table.iteritems(), key=lambda pair: pair[0])
    for ix, (key, value) in enumerate(pairs):
        f.write(ch_prefix(ix))
        f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
    f.write("\n ];\n\n")
|
||||
|
||||
# Write the comma-separated `content` to `f`, indented by `indent` spaces and
# wrapped onto a new line whenever appending the next chunk would bring the
# current line (before its separator) to 98 characters or more.  No trailing
# newline is written after the last line.
def format_table_content(f, content, indent):
    margin = " " * indent
    current = margin
    at_start = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # Line is full: flush it and start a fresh one with this chunk.
            f.write(current + ",\n")
            current = margin + piece
            continue
        if not at_start:
            current += ", "
        current += piece
        at_start = False
    f.write(current)
|
||||
|
||||
def emit_core_norm_module(f, canon, compat):
|
||||
def emit_norm_module(f, canon, compat, combine):
|
||||
canon_keys = canon.keys()
|
||||
canon_keys.sort()
|
||||
|
||||
compat_keys = compat.keys()
|
||||
compat_keys.sort()
|
||||
f.write("pub mod normalization {\n");
|
||||
f.write(" use option::Option;\n");
|
||||
f.write(" use option::{Some, None};\n");
|
||||
f.write(" use slice::ImmutableVector;\n");
|
||||
f.write("""
|
||||
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
match r.bsearch(|&(val, _)| {
|
||||
if c == val { Equal }
|
||||
else if val < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, result) = r[idx];
|
||||
Some(result)
|
||||
}
|
||||
None => None
|
||||
}
|
||||
}\n\n
|
||||
""")
|
||||
|
||||
f.write("pub mod normalization {\n")
|
||||
|
||||
def mkdata_fun(table):
|
||||
def f(char):
|
||||
data = "(%s,&[" % escape_char(char)
|
||||
first = True
|
||||
for d in table[char]:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
return data
|
||||
return f
|
||||
|
||||
f.write(" // Canonical decompositions\n")
|
||||
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
|
||||
data = ""
|
||||
first = True
|
||||
for char in canon_keys:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += "(%s,&[" % escape_char(char)
|
||||
first2 = True
|
||||
for d in canon[char]:
|
||||
if not first2:
|
||||
data += ","
|
||||
first2 = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
format_table_content(f, data, 8)
|
||||
f.write("\n ];\n\n")
|
||||
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
|
||||
pfun=mkdata_fun(canon))
|
||||
|
||||
f.write(" // Compatibility decompositions\n")
|
||||
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
|
||||
data = ""
|
||||
first = True
|
||||
for char in compat_keys:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += "(%s,&[" % escape_char(char)
|
||||
first2 = True
|
||||
for d in compat[char]:
|
||||
if not first2:
|
||||
data += ","
|
||||
first2 = False
|
||||
data += escape_char(d)
|
||||
data += "])"
|
||||
format_table_content(f, data, 8)
|
||||
f.write("\n ];\n\n")
|
||||
|
||||
f.write("""
|
||||
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
|
||||
|
||||
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
|
||||
|
||||
fn d(c: char, i: |char|, k: bool) {
|
||||
use iter::Iterator;
|
||||
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\\x7f' { i(c); return; }
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
|
||||
decompose_hangul(c, i);
|
||||
return;
|
||||
}
|
||||
|
||||
// First check the canonical decompositions
|
||||
match bsearch_table(c, canonical_table) {
|
||||
Some(canon) => {
|
||||
for x in canon.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Bottom out if we're not doing compat.
|
||||
if !k { i(c); return; }
|
||||
|
||||
// Then check the compatibility decompositions
|
||||
match bsearch_table(c, compatibility_table) {
|
||||
Some(compat) => {
|
||||
for x in compat.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
i(c);
|
||||
}
|
||||
|
||||
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
|
||||
static S_BASE: u32 = 0xAC00;
|
||||
static L_BASE: u32 = 0x1100;
|
||||
static V_BASE: u32 = 0x1161;
|
||||
static T_BASE: u32 = 0x11A7;
|
||||
static L_COUNT: u32 = 19;
|
||||
static V_COUNT: u32 = 21;
|
||||
static T_COUNT: u32 = 28;
|
||||
static N_COUNT: u32 = (V_COUNT * T_COUNT);
|
||||
static S_COUNT: u32 = (L_COUNT * N_COUNT);
|
||||
|
||||
// Decompose a precomposed Hangul syllable
|
||||
fn decompose_hangul(s: char, f: |char|) {
|
||||
use cast::transmute;
|
||||
|
||||
let si = s as u32 - S_BASE;
|
||||
|
||||
let li = si / N_COUNT;
|
||||
unsafe {
|
||||
f(transmute(L_BASE + li));
|
||||
|
||||
let vi = (si % N_COUNT) / T_COUNT;
|
||||
f(transmute(V_BASE + vi));
|
||||
|
||||
let ti = si % T_COUNT;
|
||||
if ti > 0 {
|
||||
f(transmute(T_BASE + ti));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
""")
|
||||
|
||||
def emit_std_norm_module(f, combine):
|
||||
f.write("pub mod normalization {\n");
|
||||
f.write(" use option::{Some, None};\n");
|
||||
f.write(" use slice::ImmutableVector;\n");
|
||||
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
|
||||
pfun=mkdata_fun(compat))
|
||||
|
||||
f.write("""
|
||||
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use core::option::{Some, None};
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
use core::slice::ImmutableVector;
|
||||
match r.bsearch(|&(lo, hi, _)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
@ -420,72 +436,122 @@ def emit_std_norm_module(f, combine):
|
||||
}
|
||||
None => 0
|
||||
}
|
||||
}\n\n
|
||||
}\n
|
||||
""")
|
||||
|
||||
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
|
||||
ix = 0
|
||||
for pair in combine:
|
||||
f.write(ch_prefix(ix))
|
||||
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
|
||||
ix += 1
|
||||
f.write("\n ];\n\n")
|
||||
emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
|
||||
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
|
||||
|
||||
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
|
||||
+ " bsearch_range_value_table(c, combining_class_table)\n"
|
||||
+ " }\n")
|
||||
f.write("}\n")
|
||||
|
||||
f.write("""
|
||||
}
|
||||
|
||||
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
""")
|
||||
|
||||
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
|
||||
# Return a copy of the width table `wtable` with the single code point `val`
# removed.
#
# `wtable` is a list of (lo, hi, width, width_cjk) tuples.  A range that
# contains `val` is dropped (when it is exactly `val`), trimmed (when `val`
# is one of its ends) or split in two (when `val` is strictly inside).
# Scanning stops at the first range that starts after `val`, so the table is
# presumably expected to be sorted by `lo` -- TODO confirm against the caller.
#
# The input list is left untouched; the previous implementation consumed it
# with repeated pop(0), which was both destructive and quadratic.
def remove_from_wtable(wtable, val):
    wtable_out = []
    pos = 0
    while pos < len(wtable):
        entry = wtable[pos]
        if entry[1] < val:
            # Entirely below val: keep as is.
            wtable_out.append(entry)
        elif entry[0] > val:
            # Entirely above val: nothing further needs to be changed.
            break
        else:
            (wt_lo, wt_hi, width, width_cjk) = entry
            # Keep whatever part of the range lies on either side of val.
            if wt_lo < val:
                wtable_out.append((wt_lo, val - 1, width, width_cjk))
            if wt_hi > val:
                wtable_out.append((val + 1, wt_hi, width, width_cjk))
        pos += 1
    wtable_out.extend(wtable[pos:])
    return wtable_out
|
||||
|
||||
#![allow(missing_doc, non_uppercase_statics)]
|
||||
def optimize_width_table(wtable):
    """Collapse adjacent width-table entries that carry identical widths.

    Each entry is a `(lo, hi, width, width_cjk)` tuple covering the inclusive
    range `lo..hi`.  Two consecutive entries are merged when the second starts
    exactly one past the end of the first AND both `width` and `width_cjk`
    agree.  (The comparison previously looked at `width` only, so ranges that
    differ just in their CJK width could be fused and silently take the CJK
    width of the later range.)

    Returns a new list; `wtable` itself is not modified.  An empty table
    yields an empty list.
    """
    wtable_out = []
    for entry in wtable:
        if wtable_out:
            (lo, hi, width, width_cjk) = wtable_out[-1]
            # Merge only when the ranges touch and both widths match.
            if hi == entry[0] - 1 and (width, width_cjk) == tuple(entry[2:4]):
                wtable_out[-1] = (lo, entry[1], width, width_cjk)
                continue
        wtable_out.append(entry)
    return wtable_out
|
||||
|
||||
'''
|
||||
|
||||
(canon_decomp, compat_decomp, gencats,
|
||||
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
|
||||
|
||||
def gen_core_unicode():
|
||||
r = "core_unicode.rs"
|
||||
if __name__ == "__main__":
|
||||
r = "unicode.rs"
|
||||
if os.path.exists(r):
|
||||
os.remove(r);
|
||||
os.remove(r)
|
||||
with open(r, "w") as rf:
|
||||
# Preamble
|
||||
# write the file's preamble
|
||||
rf.write(preamble)
|
||||
|
||||
emit_bsearch_range_table(rf);
|
||||
emit_property_module(rf, "general_category", gencats)
|
||||
# download and parse all the data
|
||||
(canon_decomp, compat_decomp, gencats, combines,
|
||||
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
|
||||
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
|
||||
other_derived = ["Default_Ignorable_Code_Point"]
|
||||
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
|
||||
scripts = load_properties("Scripts.txt", [])
|
||||
props = load_properties("PropList.txt",
|
||||
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
|
||||
|
||||
emit_core_norm_module(rf, canon_decomp, compat_decomp)
|
||||
# bsearch_range_table is used in all the property modules below
|
||||
emit_bsearch_range_table(rf)
|
||||
|
||||
derived = load_properties("DerivedCoreProperties.txt",
|
||||
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
|
||||
# all of these categories will also be available as \p{} in libregex
|
||||
allcats = []
|
||||
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
|
||||
("derived_property", derived, want_derived), \
|
||||
("script", scripts, []), \
|
||||
("property", props, ["White_Space"]):
|
||||
emit_property_module(rf, name, cat, pfuns)
|
||||
allcats.extend(map(lambda x: (x, name), cat))
|
||||
allcats.sort(key=lambda c: c[0])
|
||||
|
||||
emit_property_module(rf, "derived_property", derived)
|
||||
# the \w regex corresponds to Alphabetic + Mark + Decimal_Number +
|
||||
# Connector_Punctuation + Join-Control according to UTS#18
|
||||
# http://www.unicode.org/reports/tr18/#Compatibility_Properties
|
||||
perl_words = []
|
||||
for cat in derived["Alphabetic"], gencats["M"], gencats["Nd"], \
|
||||
gencats["Pc"], props["Join_Control"]:
|
||||
perl_words.extend(ungroup_cat(cat))
|
||||
perl_words = group_cat(perl_words)
|
||||
|
||||
props = load_properties("PropList.txt", ["White_Space"])
|
||||
emit_property_module(rf, "property", props)
|
||||
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
|
||||
emit_regex_module(rf, allcats, perl_words)
|
||||
|
||||
# normalizations and conversions module
|
||||
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
|
||||
emit_conversions_module(rf, lowerupper, upperlower)
|
||||
|
||||
def gen_std_unicode():
|
||||
r = "std_unicode.rs"
|
||||
if os.path.exists(r):
|
||||
os.remove(r);
|
||||
with open(r, "w") as rf:
|
||||
# Preamble
|
||||
rf.write(preamble)
|
||||
emit_std_norm_module(rf, combines)
|
||||
# character width module
|
||||
width_table = []
|
||||
for zwcat in ["Me", "Mn", "Cf"]:
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
|
||||
width_table.append((4448, 4607, 0, 0))
|
||||
|
||||
gen_core_unicode()
|
||||
gen_std_unicode()
|
||||
# get widths, except those that are explicitly marked zero-width above
|
||||
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
|
||||
# these are doublewidth
|
||||
for dwcat in ["W", "F"]:
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
|
||||
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))
|
||||
|
||||
width_table.sort(key=lambda w: w[0])
|
||||
|
||||
# soft hyphen is not zero width in preformatted text; it's used to indicate
|
||||
# a hyphen inserted to facilitate a linebreak.
|
||||
width_table = remove_from_wtable(width_table, 173)
|
||||
|
||||
# optimize the width table by collapsing adjacent entities when possible
|
||||
width_table = optimize_width_table(width_table)
|
||||
emit_charwidth_module(rf, width_table)
|
||||
|
@ -28,6 +28,7 @@
|
||||
#![allow(unused_attribute)] // NOTE: remove after stage0
|
||||
|
||||
#[phase(plugin, link)] extern crate core;
|
||||
extern crate unicode;
|
||||
extern crate alloc;
|
||||
|
||||
#[cfg(test)] extern crate native;
|
||||
@ -69,9 +70,6 @@ pub mod string;
|
||||
pub mod vec;
|
||||
pub mod hash;
|
||||
|
||||
// Internal unicode fiddly bits for the str module
|
||||
mod unicode;
|
||||
|
||||
mod deque;
|
||||
|
||||
/// A trait to represent mutable containers
|
||||
|
@ -69,7 +69,6 @@ is the same as `&[u8]`.
|
||||
|
||||
use core::prelude::*;
|
||||
|
||||
use core::char;
|
||||
use core::default::Default;
|
||||
use core::fmt;
|
||||
use core::cmp;
|
||||
@ -79,15 +78,17 @@ use core::mem;
|
||||
use Collection;
|
||||
use hash;
|
||||
use string::String;
|
||||
use unicode;
|
||||
use vec::Vec;
|
||||
|
||||
pub use core::str::{from_utf8, CharEq, Chars, CharOffsets};
|
||||
pub use core::str::{Bytes, CharSplits};
|
||||
pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits};
|
||||
pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits};
|
||||
pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items};
|
||||
pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items};
|
||||
pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange};
|
||||
pub use core::str::{Str, StrSlice};
|
||||
pub use unicode::{Words, UnicodeStrSlice};
|
||||
|
||||
/*
|
||||
Section: Creating a string
|
||||
@ -283,7 +284,7 @@ pub struct Decompositions<'a> {
|
||||
impl<'a> Iterator<char> for Decompositions<'a> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<char> {
|
||||
use unicode::normalization::canonical_combining_class;
|
||||
use unicode::canonical_combining_class;
|
||||
|
||||
match self.buffer.as_slice().head() {
|
||||
Some(&(c, 0)) => {
|
||||
@ -299,8 +300,8 @@ impl<'a> Iterator<char> for Decompositions<'a> {
|
||||
}
|
||||
|
||||
let decomposer = match self.kind {
|
||||
Canonical => char::decompose_canonical,
|
||||
Compatible => char::decompose_compatible
|
||||
Canonical => unicode::char::decompose_canonical,
|
||||
Compatible => unicode::char::decompose_compatible
|
||||
};
|
||||
|
||||
if !self.sorted {
|
||||
|
@ -1,183 +0,0 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
|
||||
|
||||
#![allow(missing_doc, non_uppercase_statics)]
|
||||
|
||||
pub mod normalization {
|
||||
use core::prelude::*;
|
||||
|
||||
// Looks up `c` in the range table `r` and returns the `u8` stored in the
// `(lo, hi, value)` entry whose inclusive range contains `c`, or 0 when the
// search finds no such entry.
// NOTE(review): `bsearch` presumably needs `r` ordered by range — confirm.
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
|
||||
match r.bsearch(|&(lo, hi, _)| {
|
||||
// An entry compares Equal when `c` lies inside its inclusive range.
if lo <= c && c <= hi { Equal }
|
||||
// Entry ends before `c`.
else if hi < c { Less }
|
||||
// Entry starts after `c`.
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
// Only the value component of the matching entry is needed.
let (_, _, result) = r[idx];
|
||||
result
|
||||
}
|
||||
// No entry matched: fall back to 0.
None => 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static combining_class_table : &'static [(char, char, u8)] = &[
|
||||
('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232),
|
||||
('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232),
|
||||
('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220),
|
||||
('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220),
|
||||
('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220),
|
||||
('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220),
|
||||
('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240),
|
||||
('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220),
|
||||
('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220),
|
||||
('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220),
|
||||
('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232),
|
||||
('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230),
|
||||
('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234),
|
||||
('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234),
|
||||
('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230),
|
||||
('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220),
|
||||
('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220),
|
||||
('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222),
|
||||
('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230),
|
||||
('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230),
|
||||
('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230),
|
||||
('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228),
|
||||
('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10),
|
||||
('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12),
|
||||
('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14),
|
||||
('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16),
|
||||
('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18),
|
||||
('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20),
|
||||
('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22),
|
||||
('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24),
|
||||
('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230),
|
||||
('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18),
|
||||
('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30),
|
||||
('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32),
|
||||
('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28),
|
||||
('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30),
|
||||
('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32),
|
||||
('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34),
|
||||
('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220),
|
||||
('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220),
|
||||
('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220),
|
||||
('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230),
|
||||
('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220),
|
||||
('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230),
|
||||
('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230),
|
||||
('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36),
|
||||
('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220),
|
||||
('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220),
|
||||
('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220),
|
||||
('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220),
|
||||
('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220),
|
||||
('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220),
|
||||
('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220),
|
||||
('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220),
|
||||
('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220),
|
||||
('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230),
|
||||
('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230),
|
||||
('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230),
|
||||
('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230),
|
||||
('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230),
|
||||
('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230),
|
||||
('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230),
|
||||
('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27),
|
||||
('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29),
|
||||
('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220),
|
||||
('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220),
|
||||
('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7),
|
||||
('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230),
|
||||
('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230),
|
||||
('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9),
|
||||
('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9),
|
||||
('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9),
|
||||
('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9),
|
||||
('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9),
|
||||
('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91),
|
||||
('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9),
|
||||
('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9),
|
||||
('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9),
|
||||
('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118),
|
||||
('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220),
|
||||
('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220),
|
||||
('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129),
|
||||
('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132),
|
||||
('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130),
|
||||
('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9),
|
||||
('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220),
|
||||
('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9),
|
||||
('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230),
|
||||
('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9),
|
||||
('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230),
|
||||
('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222),
|
||||
('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220),
|
||||
('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220),
|
||||
('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230),
|
||||
('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7),
|
||||
('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230),
|
||||
('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230),
|
||||
('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7),
|
||||
('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7),
|
||||
('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1),
|
||||
('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230),
|
||||
('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230),
|
||||
('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220),
|
||||
('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230),
|
||||
('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230),
|
||||
('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230),
|
||||
('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214),
|
||||
('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202),
|
||||
('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233),
|
||||
('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230),
|
||||
('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230),
|
||||
('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230),
|
||||
('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230),
|
||||
('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1),
|
||||
('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220),
|
||||
('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1),
|
||||
('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230),
|
||||
('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9),
|
||||
('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218),
|
||||
('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232),
|
||||
('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224),
|
||||
('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230),
|
||||
('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230),
|
||||
('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9),
|
||||
('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230),
|
||||
('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9),
|
||||
('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9),
|
||||
('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230),
|
||||
('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230),
|
||||
('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230),
|
||||
('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9),
|
||||
('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230),
|
||||
('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220),
|
||||
('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230),
|
||||
('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220),
|
||||
('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9),
|
||||
('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7),
|
||||
('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9),
|
||||
('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9),
|
||||
('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216),
|
||||
('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226),
|
||||
('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220),
|
||||
('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220),
|
||||
('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230)
|
||||
];
|
||||
|
||||
pub fn canonical_combining_class(c: char) -> u8 {
|
||||
bsearch_range_value_table(c, combining_class_table)
|
||||
}
|
||||
}
|
@ -8,20 +8,9 @@
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! Character manipulation (`char` type, Unicode Scalar Value)
|
||||
//! Character manipulation.
|
||||
//!
|
||||
//! This module provides the `Char` trait, as well as its implementation
|
||||
//! for the primitive `char` type, in order to allow basic character manipulation.
|
||||
//!
|
||||
//! A `char` actually represents a
|
||||
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
|
||||
//! as it can contain any Unicode code point except high-surrogate and
|
||||
//! low-surrogate code points.
|
||||
//!
|
||||
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
|
||||
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
|
||||
//! however the converse is not always true due to the above range limits
|
||||
//! and, as such, should be performed via the `from_u32` function.
|
||||
//! For more details, see ::unicode::char (a.k.a. std::char)
|
||||
|
||||
#![allow(non_snake_case_functions)]
|
||||
#![doc(primitive = "char")]
|
||||
@ -29,12 +18,6 @@
|
||||
use mem::transmute;
|
||||
use option::{None, Option, Some};
|
||||
use iter::{Iterator, range_step};
|
||||
use unicode::{derived_property, property, general_category, conversions};
|
||||
|
||||
/// Returns the canonical decomposition of a character.
|
||||
pub use unicode::normalization::decompose_canonical;
|
||||
/// Returns the compatibility decomposition of a character.
|
||||
pub use unicode::normalization::decompose_compatible;
|
||||
|
||||
// UTF-8 ranges and tags for encoding characters
|
||||
static TAG_CONT: u8 = 0b1000_0000u8;
|
||||
@ -93,84 +76,6 @@ pub fn from_u32(i: u32) -> Option<char> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether the specified `char` is considered a Unicode alphabetic
|
||||
/// code point
|
||||
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in lower case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in upper case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is whitespace
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
// As an optimization ASCII whitespace characters are checked separately
|
||||
c == ' '
|
||||
|| ('\x09' <= c && c <= '\x0d')
|
||||
|| property::White_Space(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is alphanumeric
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_alphanumeric(c: char) -> bool {
|
||||
derived_property::Alphabetic(c)
|
||||
|| general_category::Nd(c)
|
||||
|| general_category::Nl(c)
|
||||
|| general_category::No(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is a control code point
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General Category
|
||||
/// 'Cc'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
|
||||
|
||||
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
|
||||
#[inline]
|
||||
pub fn is_digit(c: char) -> bool {
|
||||
general_category::Nd(c)
|
||||
|| general_category::Nl(c)
|
||||
|| general_category::No(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Checks if a `char` parses as a numeric digit in the given radix
|
||||
///
|
||||
@ -227,38 +132,6 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
|
||||
else { None }
|
||||
}
|
||||
|
||||
/// Convert a char to its uppercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping:
|
||||
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
|
||||
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here
|
||||
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion was made
|
||||
#[inline]
|
||||
pub fn to_uppercase(c: char) -> char {
|
||||
conversions::to_upper(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its lowercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping
|
||||
/// see `to_uppercase` for references and more information
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion is possible
|
||||
#[inline]
|
||||
pub fn to_lowercase(c: char) -> char {
|
||||
conversions::to_lower(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Converts a number to the character representing it
|
||||
///
|
||||
@ -355,61 +228,8 @@ pub fn len_utf8_bytes(c: char) -> uint {
|
||||
}
|
||||
}
|
||||
|
||||
/// Useful functions for Unicode characters.
|
||||
/// Basic `char` manipulations.
|
||||
pub trait Char {
|
||||
/// Returns whether the specified character is considered a Unicode
|
||||
/// alphabetic code point.
|
||||
fn is_alphabetic(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified character satisfies the 'XID_Start'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
fn is_XID_start(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
fn is_XID_continue(&self) -> bool;
|
||||
|
||||
|
||||
/// Indicates whether a character is in lowercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Lowercase`.
|
||||
fn is_lowercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is in uppercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Uppercase`.
|
||||
fn is_uppercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is whitespace.
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is alphanumeric.
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is a control code point.
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General
|
||||
/// Category `Cc`.
|
||||
fn is_control(&self) -> bool;
|
||||
|
||||
/// Indicates whether the character is numeric (Nd, Nl, or No).
|
||||
fn is_digit(&self) -> bool;
|
||||
|
||||
/// Checks if a `char` parses as a numeric digit in the given radix.
|
||||
///
|
||||
/// Compared to `is_digit()`, this function only recognizes the characters
|
||||
@ -438,37 +258,6 @@ pub trait Char {
|
||||
/// Fails if given a radix outside the range [0..36].
|
||||
fn to_digit(&self, radix: uint) -> Option<uint>;
|
||||
|
||||
/// Converts a character to its lowercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping. See
|
||||
/// `to_uppercase()` for references and more information.
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the lowercase equivalent of the character, or the character
|
||||
/// itself if no conversion is possible.
|
||||
fn to_lowercase(&self) -> char;
|
||||
|
||||
/// Converts a character to its uppercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping: it maps
|
||||
/// one unicode codepoint (one character in Rust) to its uppercase
|
||||
/// equivalent according to the Unicode database [1]. The additional
|
||||
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here [2].
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the uppercase equivalent of the character, or the character
|
||||
/// itself if no conversion was made.
|
||||
///
|
||||
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
///
|
||||
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
fn to_uppercase(&self) -> char;
|
||||
|
||||
/// Converts a number to the character representing it.
|
||||
///
|
||||
/// # Return value
|
||||
@ -526,32 +315,10 @@ pub trait Char {
|
||||
}
|
||||
|
||||
impl Char for char {
|
||||
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
|
||||
|
||||
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
|
||||
|
||||
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
|
||||
|
||||
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
|
||||
|
||||
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
|
||||
|
||||
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
|
||||
|
||||
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
|
||||
|
||||
fn is_control(&self) -> bool { is_control(*self) }
|
||||
|
||||
fn is_digit(&self) -> bool { is_digit(*self) }
|
||||
|
||||
fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
|
||||
|
||||
fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
|
||||
|
||||
fn to_lowercase(&self) -> char { to_lowercase(*self) }
|
||||
|
||||
fn to_uppercase(&self) -> char { to_uppercase(*self) }
|
||||
|
||||
fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
|
||||
|
||||
fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
|
||||
@ -600,5 +367,3 @@ impl Char for char {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -108,7 +108,6 @@ pub mod collections;
|
||||
|
||||
/* Core types and methods on primitives */
|
||||
|
||||
mod unicode;
|
||||
pub mod any;
|
||||
pub mod atomics;
|
||||
pub mod bool;
|
||||
|
@ -22,7 +22,7 @@ use cmp;
|
||||
use cmp::{PartialEq, Eq};
|
||||
use collections::Collection;
|
||||
use default::Default;
|
||||
use iter::{Filter, Map, Iterator};
|
||||
use iter::{Map, Iterator};
|
||||
use iter::{DoubleEndedIterator, ExactSize};
|
||||
use iter::range;
|
||||
use num::{CheckedMul, Saturating};
|
||||
@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
|
||||
invert: bool,
|
||||
}
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
pub type Words<'a> =
|
||||
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
|
||||
|
||||
/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
|
||||
pub type AnyLines<'a> =
|
||||
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
|
||||
@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
|
||||
/// ```
|
||||
fn lines_any(&self) -> AnyLines<'a>;
|
||||
|
||||
/// An iterator over the words of a string (subsequences separated
|
||||
/// by any sequence of whitespace). Sequences of whitespace are
|
||||
/// collapsed, so empty "words" are not included.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// let some_words = " Mary had\ta little \n\t lamb";
|
||||
/// let v: Vec<&str> = some_words.words().collect();
|
||||
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
|
||||
/// ```
|
||||
fn words(&self) -> Words<'a>;
|
||||
|
||||
/// Returns true if the string contains only whitespace.
|
||||
///
|
||||
/// Whitespace characters are determined by `char::is_whitespace`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!(" \t\n".is_whitespace());
|
||||
/// assert!("".is_whitespace());
|
||||
///
|
||||
/// assert!( !"abc".is_whitespace());
|
||||
/// ```
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Returns true if the string contains only alphanumeric code
|
||||
/// points.
|
||||
///
|
||||
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
|
||||
/// assert!("".is_alphanumeric());
|
||||
///
|
||||
/// assert!( !" &*~".is_alphanumeric());
|
||||
/// ```
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Returns the number of Unicode code points (`char`) that a
|
||||
/// string holds.
|
||||
///
|
||||
@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
|
||||
/// Returns true if `needle` is a suffix of the string.
|
||||
fn ends_with(&self, needle: &str) -> bool;
|
||||
|
||||
/// Returns a string with leading and trailing whitespace removed.
|
||||
fn trim(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with leading whitespace removed.
|
||||
fn trim_left(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with trailing whitespace removed.
|
||||
fn trim_right(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with characters that match `to_trim` removed.
|
||||
///
|
||||
/// # Arguments
|
||||
@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn words(&self) -> Words<'a> {
|
||||
self.split(char::is_whitespace).filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
|
||||
|
||||
#[inline]
|
||||
fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
|
||||
|
||||
#[inline]
|
||||
fn char_len(&self) -> uint { self.chars().count() }
|
||||
|
||||
@ -1814,21 +1748,6 @@ impl<'a> StrSlice<'a> for &'a str {
|
||||
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim(&self) -> &'a str {
|
||||
self.trim_left().trim_right()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_left(&self) -> &'a str {
|
||||
self.trim_left_chars(char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_right(&self) -> &'a str {
|
||||
self.trim_right_chars(char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
|
||||
let cur = match self.find(|c: char| !to_trim.matches(c)) {
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -200,3 +200,30 @@ fn test_encode_utf16() {
|
||||
check('\ua66e', [0xa66e]);
|
||||
check('\U0001f4a9', [0xd83d, 0xdca9]);
|
||||
}
|
||||
|
||||
#[test]
fn test_width() {
    // NUL is the only control character with a defined width (zero).
    assert_eq!('\x00'.width(false), Some(0));
    assert_eq!('\x00'.width(true), Some(0));

    // Any other control character (here LINE FEED) has no width.
    assert_eq!('\x0A'.width(false), None);
    assert_eq!('\x0A'.width(true), None);

    // A plain ASCII letter is one column wide in every context.
    assert_eq!('w'.width(false), Some(1));
    assert_eq!('w'.width(true), Some(1));

    // U+FF48 FULLWIDTH LATIN SMALL LETTER H is two columns wide in every
    // context. It is written as an escape so that it cannot be silently
    // folded to ASCII 'h', which is only one column wide.
    assert_eq!('\uff48'.width(false), Some(2));
    assert_eq!('\uff48'.width(true), Some(2));

    // U+00AD SOFT HYPHEN
    assert_eq!('\xAD'.width(false), Some(1));
    assert_eq!('\xAD'.width(true), Some(1));

    // U+1160 HANGUL JUNGSEONG FILLER
    assert_eq!('\u1160'.width(false), Some(0));
    assert_eq!('\u1160'.width(true), Some(0));

    // U+00A1 has an ambiguous width: one column normally, two in CJK contexts.
    assert_eq!('\u00a1'.width(false), Some(1));
    assert_eq!('\u00a1'.width(true), Some(2));

    // U+0300 COMBINING GRAVE ACCENT
    assert_eq!('\u0300'.width(false), Some(0));
    assert_eq!('\u0300'.width(true), Some(0));
}
|
||||
|
@ -306,12 +306,15 @@
|
||||
//!
|
||||
//! ## Perl character classes (Unicode friendly)
|
||||
//!
|
||||
//! These classes are based on the definitions provided in
|
||||
//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! \d digit ([0-9] + \p{Nd})
|
||||
//! \d digit (\p{Nd})
|
||||
//! \D not digit
|
||||
//! \s whitespace ([\t\n\f\r ] + \p{Z})
|
||||
//! \s whitespace (\p{White_Space})
|
||||
//! \S not whitespace
|
||||
//! \w word character ([0-9A-Za-z_] + \p{L})
|
||||
//! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
|
||||
//! \W not word character
|
||||
//! </pre>
|
||||
//!
|
||||
@ -378,6 +381,9 @@ extern crate rand;
|
||||
#[cfg(test)]
|
||||
extern crate regex;
|
||||
|
||||
// unicode tables for character classes are defined in libunicode
|
||||
extern crate unicode;
|
||||
|
||||
pub use parse::Error;
|
||||
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
|
||||
pub use re::{FindCaptures, FindMatches};
|
||||
|
@ -16,9 +16,7 @@ use std::num;
|
||||
use std::str;
|
||||
|
||||
/// Static data containing Unicode ranges for general categories and scripts.
|
||||
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
|
||||
#[allow(visible_private_types)]
|
||||
pub mod unicode;
|
||||
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
|
||||
|
||||
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
|
||||
static MAX_REPEAT: uint = 1000;
|
File diff suppressed because it is too large
Load Diff
@ -42,7 +42,7 @@ use compile::{
|
||||
Save, Jump, Split,
|
||||
};
|
||||
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
|
||||
use parse::unicode::PERLW;
|
||||
use unicode::regex::PERLW;
|
||||
|
||||
pub type CaptureLocs = Vec<Option<uint>>;
|
||||
|
||||
|
@ -237,6 +237,7 @@ use str::{Str, StrSlice, StrAllocating};
|
||||
use str;
|
||||
use string::String;
|
||||
use uint;
|
||||
use unicode::UnicodeChar;
|
||||
use vec::Vec;
|
||||
|
||||
// Reexports
|
||||
|
@ -126,6 +126,7 @@
|
||||
#[cfg(test)] #[phase(plugin, link)] extern crate log;
|
||||
|
||||
extern crate alloc;
|
||||
extern crate unicode;
|
||||
extern crate core;
|
||||
extern crate core_collections = "collections";
|
||||
extern crate core_rand = "rand";
|
||||
@ -148,7 +149,6 @@ extern crate rustrt;
|
||||
pub use core::any;
|
||||
pub use core::bool;
|
||||
pub use core::cell;
|
||||
pub use core::char;
|
||||
pub use core::clone;
|
||||
#[cfg(not(test))] pub use core::cmp;
|
||||
pub use core::default;
|
||||
@ -180,6 +180,8 @@ pub use core_collections::vec;
|
||||
pub use rustrt::c_str;
|
||||
pub use rustrt::local_data;
|
||||
|
||||
pub use unicode::char;
|
||||
|
||||
pub use core_sync::comm;
|
||||
|
||||
// Run tests with libgreen instead of libnative.
|
||||
|
@ -24,6 +24,7 @@ use option::{Option, Some, None};
|
||||
use slice::{Vector, ImmutableVector};
|
||||
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
|
||||
use string::String;
|
||||
use unicode::UnicodeChar;
|
||||
use vec::Vec;
|
||||
|
||||
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
|
||||
@ -997,7 +998,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
|
||||
let idx = path.find('\\');
|
||||
if idx == Some(2) && path.as_bytes()[1] == ':' as u8 {
|
||||
let c = path.as_bytes()[0];
|
||||
if c.is_ascii() && ::char::is_alphabetic(c as char) {
|
||||
if c.is_ascii() && (c as char).is_alphabetic() {
|
||||
// \\?\C:\ path
|
||||
return Some(VerbatimDiskPrefix);
|
||||
}
|
||||
@ -1021,7 +1022,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
|
||||
} else if path.len() > 1 && path.as_bytes()[1] == ':' as u8 {
|
||||
// C:
|
||||
let c = path.as_bytes()[0];
|
||||
if c.is_ascii() && ::char::is_alphabetic(c as char) {
|
||||
if c.is_ascii() && (c as char).is_alphabetic() {
|
||||
return Some(DiskPrefix);
|
||||
}
|
||||
}
|
||||
|
@ -88,6 +88,7 @@
|
||||
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
|
||||
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
|
||||
#[doc(no_inline)] pub use string::String;
|
||||
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
|
||||
#[doc(no_inline)] pub use vec::Vec;
|
||||
|
||||
// Reexported runtime types
|
||||
|
@ -12,7 +12,6 @@
|
||||
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use char::Char;
|
||||
use collections::Collection;
|
||||
use from_str::from_str;
|
||||
use io::{IoResult, Writer};
|
||||
@ -22,6 +21,7 @@ use os;
|
||||
use result::{Ok, Err};
|
||||
use str::StrSlice;
|
||||
use sync::atomics;
|
||||
use unicode::UnicodeChar;
|
||||
|
||||
pub use self::imp::write;
|
||||
|
||||
|
111
src/libunicode/decompose.rs
Normal file
111
src/libunicode/decompose.rs
Normal file
@ -0,0 +1,111 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
Functions for computing canonical and compatible decompositions
|
||||
for Unicode characters.
|
||||
*/
|
||||
|
||||
use core::option::{Option, Some, None};
|
||||
use core::slice::ImmutableVector;
|
||||
use tables::normalization::{canonical_table, compatibility_table};
|
||||
|
||||
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
|
||||
use core::cmp::{Equal, Less, Greater};
|
||||
match r.bsearch(|&(val, _)| {
|
||||
if c == val { Equal }
|
||||
else if val < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, result) = r[idx];
|
||||
Some(result)
|
||||
}
|
||||
None => None
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute canonical Unicode decomposition for character
|
||||
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
|
||||
|
||||
/// Compute canonical or compatible Unicode decomposition for character
|
||||
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
|
||||
|
||||
fn d(c: char, i: |char|, k: bool) {
|
||||
use core::iter::Iterator;
|
||||
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' { i(c); return; }
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
|
||||
decompose_hangul(c, i);
|
||||
return;
|
||||
}
|
||||
|
||||
// First check the canonical decompositions
|
||||
match bsearch_table(c, canonical_table) {
|
||||
Some(canon) => {
|
||||
for x in canon.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Bottom out if we're not doing compat.
|
||||
if !k { i(c); return; }
|
||||
|
||||
// Then check the compatibility decompositions
|
||||
match bsearch_table(c, compatibility_table) {
|
||||
Some(compat) => {
|
||||
for x in compat.iter() {
|
||||
d(*x, |b| i(b), k);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => ()
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
i(c);
|
||||
}
|
||||
|
||||
// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior

// Start of the precomposed syllable range tested by `d`
static S_BASE: u32 = 0xAC00;
// Bases that the L, V and T indices are added to in `decompose_hangul`
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;

// Number of distinct L, V and T indices
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;

// Syllables per L index, and syllables in total
static N_COUNT: u32 = V_COUNT * T_COUNT;
static S_COUNT: u32 = L_COUNT * N_COUNT;
|
||||
|
||||
// Decompose a precomposed Hangul syllable
|
||||
fn decompose_hangul(s: char, f: |char|) {
|
||||
use core::mem::transmute;
|
||||
|
||||
let si = s as u32 - S_BASE;
|
||||
|
||||
let li = si / N_COUNT;
|
||||
unsafe {
|
||||
f(transmute(L_BASE + li));
|
||||
|
||||
let vi = (si % N_COUNT) / T_COUNT;
|
||||
f(transmute(V_BASE + vi));
|
||||
|
||||
let ti = si % T_COUNT;
|
||||
if ti > 0 {
|
||||
f(transmute(T_BASE + ti));
|
||||
}
|
||||
}
|
||||
}
|
77
src/libunicode/lib.rs
Normal file
77
src/libunicode/lib.rs
Normal file
@ -0,0 +1,77 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! # The Unicode Library
|
||||
//!
|
||||
//! Unicode-intensive functions for `char` and `str` types.
|
||||
//!
|
||||
//! This crate provides a collection of Unicode-related functionality,
|
||||
//! including decompositions, conversions, etc., and provides traits
|
||||
//! implementing these functions for the `char` and `str` types.
|
||||
//!
|
||||
//! The functionality included here is only that which is necessary to
|
||||
//! provide for basic string-related manipulations. This crate does not
|
||||
//! (yet) aim to provide a full set of Unicode tables.
|
||||
|
||||
#![crate_id = "unicode#0.11.0"]
|
||||
#![crate_name = "unicode"]
|
||||
#![experimental]
|
||||
#![license = "MIT/ASL2"]
|
||||
#![crate_type = "rlib"]
|
||||
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
|
||||
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
|
||||
html_root_url = "http://doc.rust-lang.org/",
|
||||
html_playground_url = "http://play.rust-lang.org/")]
|
||||
#![no_std]
|
||||
#![allow(unused_attribute)] // NOTE: remove after stage0
|
||||
|
||||
extern crate core;
|
||||
|
||||
pub use tables::normalization::canonical_combining_class;
|
||||
pub use tables::regex;
|
||||
|
||||
pub use u_char::UnicodeChar;
|
||||
pub use u_str::UnicodeStrSlice;
|
||||
pub use u_str::Words;
|
||||
|
||||
mod decompose;
|
||||
mod tables;
|
||||
mod u_char;
|
||||
mod u_str;
|
||||
|
||||
// re-export char so that std et al see it correctly
|
||||
/// Character manipulation (`char` type, Unicode Scalar Value)
|
||||
///
|
||||
/// This module provides the `Char` and `UnicodeChar` traits, as well as their
|
||||
/// implementation for the primitive `char` type, in order to allow basic character
|
||||
/// manipulation.
|
||||
///
|
||||
/// A `char` actually represents a
|
||||
/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
|
||||
/// as it can contain any Unicode code point except high-surrogate and
|
||||
/// low-surrogate code points.
|
||||
///
|
||||
/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
|
||||
/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
|
||||
/// however the converse is not always true due to the above range limits
|
||||
/// and, as such, should be performed via the `from_u32` function.
|
||||
pub mod char {
|
||||
pub use core::char::{MAX, from_u32, is_digit_radix, to_digit};
|
||||
pub use core::char::{from_digit, escape_unicode, escape_default};
|
||||
pub use core::char::{len_utf8_bytes, Char};
|
||||
|
||||
pub use decompose::decompose_canonical;
|
||||
pub use decompose::decompose_compatible;
|
||||
|
||||
pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
|
||||
pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
|
||||
pub use u_char::{is_alphanumeric, is_control, is_digit};
|
||||
pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
|
||||
}
|
6445
src/libunicode/tables.rs
Normal file
6445
src/libunicode/tables.rs
Normal file
File diff suppressed because it is too large
Load Diff
266
src/libunicode/u_char.rs
Normal file
266
src/libunicode/u_char.rs
Normal file
@ -0,0 +1,266 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
* Unicode-intensive `char` methods.
|
||||
*
|
||||
* These methods implement functionality for `char` that requires knowledge of
|
||||
* Unicode definitions, including normalization, categorization, and display information.
|
||||
*/
|
||||
|
||||
use core::option::Option;
|
||||
use tables::{derived_property, property, general_category, conversions, charwidth};
|
||||
|
||||
/// Returns whether the specified `char` is considered a Unicode alphabetic
|
||||
/// code point
|
||||
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in lower case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is in upper case
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is whitespace
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
// As an optimization ASCII whitespace characters are checked separately
|
||||
c == ' '
|
||||
|| ('\x09' <= c && c <= '\x0d')
|
||||
|| property::White_Space(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is alphanumeric
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_alphanumeric(c: char) -> bool {
|
||||
derived_property::Alphabetic(c)
|
||||
|| general_category::N(c)
|
||||
}
|
||||
|
||||
///
|
||||
/// Indicates whether a `char` is a control code point
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General Category
|
||||
/// 'Cc'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
|
||||
|
||||
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
|
||||
#[inline]
|
||||
pub fn is_digit(c: char) -> bool {
|
||||
general_category::N(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its uppercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping:
|
||||
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
|
||||
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here
|
||||
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion was made
|
||||
#[inline]
|
||||
pub fn to_uppercase(c: char) -> char {
|
||||
conversions::to_upper(c)
|
||||
}
|
||||
|
||||
/// Convert a char to its lowercase equivalent
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping
|
||||
/// see `to_uppercase` for references and more information
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the char itself if no conversion if possible
|
||||
#[inline]
|
||||
pub fn to_lowercase(c: char) -> char {
|
||||
conversions::to_lower(c)
|
||||
}
|
||||
|
||||
/// Returns this character's displayed width in columns, or `None` if it is a
|
||||
/// control character other than `'\x00'`.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the context cannot be reliably determined.
|
||||
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
|
||||
charwidth::width(c, is_cjk)
|
||||
}
|
||||
|
||||
/// Useful functions for Unicode characters.
|
||||
pub trait UnicodeChar {
|
||||
/// Returns whether the specified character is considered a Unicode
|
||||
/// alphabetic code point.
|
||||
fn is_alphabetic(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified character satisfies the 'XID_Start'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Start' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to ID_Start but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
fn is_XID_start(&self) -> bool;
|
||||
|
||||
/// Returns whether the specified `char` satisfies the 'XID_Continue'
|
||||
/// Unicode property.
|
||||
///
|
||||
/// 'XID_Continue' is a Unicode Derived Property specified in
|
||||
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
|
||||
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
|
||||
#[allow(non_snake_case_functions)]
|
||||
fn is_XID_continue(&self) -> bool;
|
||||
|
||||
|
||||
/// Indicates whether a character is in lowercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Lowercase`.
|
||||
fn is_lowercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is in uppercase.
|
||||
///
|
||||
/// This is defined according to the terms of the Unicode Derived Core
|
||||
/// Property `Uppercase`.
|
||||
fn is_uppercase(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is whitespace.
|
||||
///
|
||||
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is alphanumeric.
|
||||
///
|
||||
/// Alphanumericness is defined in terms of the Unicode General Categories
|
||||
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Indicates whether a character is a control code point.
|
||||
///
|
||||
/// Control code points are defined in terms of the Unicode General
|
||||
/// Category `Cc`.
|
||||
fn is_control(&self) -> bool;
|
||||
|
||||
/// Indicates whether the character is numeric (Nd, Nl, or No).
|
||||
fn is_digit(&self) -> bool;
|
||||
|
||||
/// Converts a character to its lowercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping. See
|
||||
/// `to_uppercase()` for references and more information.
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the lowercase equivalent of the character, or the character
|
||||
/// itself if no conversion is possible.
|
||||
fn to_lowercase(&self) -> char;
|
||||
|
||||
/// Converts a character to its uppercase equivalent.
|
||||
///
|
||||
/// The case-folding performed is the common or simple mapping: it maps
|
||||
/// one unicode codepoint (one character in Rust) to its uppercase
|
||||
/// equivalent according to the Unicode database [1]. The additional
|
||||
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
|
||||
/// codepoints in some cases.
|
||||
///
|
||||
/// A full reference can be found here [2].
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// Returns the uppercase equivalent of the character, or the character
|
||||
/// itself if no conversion was made.
|
||||
///
|
||||
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
///
|
||||
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
||||
fn to_uppercase(&self) -> char;
|
||||
|
||||
/// Returns this character's displayed width in columns, or `None` if it is a
|
||||
/// control character other than `'\x00'`.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the context cannot be reliably determined.
|
||||
fn width(&self, is_cjk: bool) -> Option<uint>;
|
||||
}
|
||||
|
||||
impl UnicodeChar for char {
|
||||
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
|
||||
|
||||
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
|
||||
|
||||
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
|
||||
|
||||
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
|
||||
|
||||
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
|
||||
|
||||
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
|
||||
|
||||
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
|
||||
|
||||
fn is_control(&self) -> bool { is_control(*self) }
|
||||
|
||||
fn is_digit(&self) -> bool { is_digit(*self) }
|
||||
|
||||
fn to_lowercase(&self) -> char { to_lowercase(*self) }
|
||||
|
||||
fn to_uppercase(&self) -> char { to_uppercase(*self) }
|
||||
|
||||
fn width(&self, is_cjk: bool) -> Option<uint> { width(*self, is_cjk) }
|
||||
}
|
119
src/libunicode/u_str.rs
Normal file
119
src/libunicode/u_str.rs
Normal file
@ -0,0 +1,119 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
/*!
|
||||
* Unicode-intensive string manipulations.
|
||||
*
|
||||
* This module provides functionality to `str` that requires the Unicode
|
||||
* methods provided by the UnicodeChar trait.
|
||||
*/
|
||||
|
||||
use core::collections::Collection;
|
||||
use core::iter::{Filter};
|
||||
use core::str::{CharSplits, StrSlice};
|
||||
use core::iter::Iterator;
|
||||
use u_char;
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
pub type Words<'a> =
|
||||
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
|
||||
|
||||
/// Methods for Unicode string slices
|
||||
pub trait UnicodeStrSlice<'a> {
|
||||
/// An iterator over the words of a string (subsequences separated
|
||||
/// by any sequence of whitespace). Sequences of whitespace are
|
||||
/// collapsed, so empty "words" are not included.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// let some_words = " Mary had\ta little \n\t lamb";
|
||||
/// let v: Vec<&str> = some_words.words().collect();
|
||||
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
|
||||
/// ```
|
||||
fn words(&self) -> Words<'a>;
|
||||
|
||||
/// Returns true if the string contains only whitespace.
|
||||
///
|
||||
/// Whitespace characters are determined by `char::is_whitespace`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!(" \t\n".is_whitespace());
|
||||
/// assert!("".is_whitespace());
|
||||
///
|
||||
/// assert!( !"abc".is_whitespace());
|
||||
/// ```
|
||||
fn is_whitespace(&self) -> bool;
|
||||
|
||||
/// Returns true if the string contains only alphanumeric code
|
||||
/// points.
|
||||
///
|
||||
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
|
||||
/// assert!("".is_alphanumeric());
|
||||
///
|
||||
/// assert!( !" &*~".is_alphanumeric());
|
||||
/// ```
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
|
||||
/// Returns a string's displayed width in columns, treating control
|
||||
/// characters as zero-width.
|
||||
///
|
||||
/// `is_cjk` determines behavior for characters in the Ambiguous category:
|
||||
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
|
||||
/// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
|
||||
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
|
||||
/// recommends that these characters be treated as 1 column (i.e.,
|
||||
/// `is_cjk` = `false`) if the locale is unknown.
|
||||
//fn width(&self, is_cjk: bool) -> uint;
|
||||
|
||||
/// Returns a string with leading and trailing whitespace removed.
|
||||
fn trim(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with leading whitespace removed.
|
||||
fn trim_left(&self) -> &'a str;
|
||||
|
||||
/// Returns a string with trailing whitespace removed.
|
||||
fn trim_right(&self) -> &'a str;
|
||||
}
|
||||
|
||||
impl<'a> UnicodeStrSlice<'a> for &'a str {
|
||||
#[inline]
|
||||
fn words(&self) -> Words<'a> {
|
||||
self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
|
||||
|
||||
#[inline]
|
||||
fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
|
||||
|
||||
#[inline]
|
||||
fn trim(&self) -> &'a str {
|
||||
self.trim_left().trim_right()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_left(&self) -> &'a str {
|
||||
self.trim_left_chars(u_char::is_whitespace)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn trim_right(&self) -> &'a str {
|
||||
self.trim_right_chars(u_char::is_whitespace)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user