Add libunicode; move unicode functions from core

- created new crate, libunicode, below libstd
- split Char trait into Char (libcore) and UnicodeChar (libunicode)
  - Unicode-aware functions now live in libunicode
    - is_alphabetic, is_XID_start, is_XID_continue, is_lowercase,
      is_uppercase, is_whitespace, is_alphanumeric, is_control,
      is_digit, to_uppercase, to_lowercase
  - added width method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is
      a non-NULL control character
    - takes a boolean argument indicating whether the present context is
      CJK or not (characters with 'A'mbiguous widths are double-wide in
      CJK contexts, single-wide otherwise); see the example below
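    - for example (a sketch, assuming the UnicodeChar trait is in scope,
      e.g. via the libstd prelude; values match the new width tests):

          assert_eq!('w'.width(false), Some(1));      // ASCII is single-wide
          assert_eq!('\u00a1'.width(false), Some(1)); // 'A'mbiguous: one column outside CJK
          assert_eq!('\u00a1'.width(true), Some(2));  // 'A'mbiguous: two columns in CJK
          assert_eq!('\x0A'.width(false), None);      // control characters have no width
          assert_eq!('\x00'.width(false), Some(0));   // except NULL, which is zero-wide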
- split StrSlice into StrSlice (libcore) and UnicodeStrSlice
  (libunicode)
  - methods formerly in StrSlice that relied upon Unicode functionality
    from Char are now in UnicodeStrSlice
    - words, is_whitespace, is_alphanumeric, trim, trim_left, trim_right
      (see the example below)
  - also moved Words type alias into libunicode because words method is
    in UnicodeStrSlice
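  - for example (a sketch, assuming the UnicodeStrSlice trait is in
    scope, e.g. via the libstd prelude; the input string is taken from
    the words() doc example):

        let s = " Mary had\ta little \n\t lamb";
        let v: Vec<&str> = s.words().collect(); // whitespace runs collapse
        assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
        assert_eq!(" \t hello \n".trim(), "hello");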
- unified Unicode tables from libcollections, libcore, and libregex into
  libunicode
- updated unicode.py in src/etc to generate aforementioned tables
- generated new tables based on latest Unicode data
- added UnicodeChar and UnicodeStrSlice traits to prelude
- libunicode is now the collection point for the std::char module,
  combining the libunicode functionality with the Char functionality
  from libcore
  - thus, moved doc comment for char from core::char to unicode::char
- libcollections remains the collection point for std::str

The Unicode-aware functions that previously lived in the Char and
StrSlice traits are no longer available to programs that only use
libcore. To regain use of these methods, include the libunicode crate
and use the UnicodeChar and/or UnicodeStrSlice traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method
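
With those imports in place, the methods behave as before; for example
(values taken from the existing doc examples):

    assert!('a'.is_alphabetic());
    assert!(" \t\n".is_whitespace());
    assert_eq!('A'.to_lowercase(), 'a');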

NOTE: this does *not* impact programs that use libstd, since UnicodeChar
and UnicodeStrSlice have been added to the prelude.

closes #15224
[breaking-change]
kwantam 2014-06-30 17:04:10 -04:00
parent 4f120e6baf
commit 5d4238b6fc
25 changed files with 7439 additions and 11593 deletions


@@ -51,17 +51,19 @@
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
url log regex graphviz core rlibc alloc debug rustrt
url log regex graphviz core rlibc alloc debug rustrt \
unicode
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc
DEPS_core :=
DEPS_rlibc :=
DEPS_unicode := core
DEPS_alloc := core libc native:jemalloc
DEPS_debug := std
DEPS_rustrt := alloc core libc collections native:rustrt_native
DEPS_std := core libc rand alloc collections rustrt sync \
DEPS_std := core libc rand alloc collections rustrt sync unicode \
native:rust_builtin native:backtrace
DEPS_graphviz := std
DEPS_green := std native:context_switch
@@ -82,7 +84,7 @@ DEPS_semver := std
DEPS_uuid := std serialize
DEPS_sync := core alloc rustrt collections
DEPS_getopts := std
DEPS_collections := core alloc
DEPS_collections := core alloc unicode
DEPS_fourcc := rustc syntax std
DEPS_hexfloat := rustc syntax std
DEPS_num := std
@@ -108,6 +110,7 @@ ONLY_RLIB_rlibc := 1
ONLY_RLIB_alloc := 1
ONLY_RLIB_rand := 1
ONLY_RLIB_collections := 1
ONLY_RLIB_unicode := 1
################################################################################
# You should not need to edit below this line


@@ -1,183 +0,0 @@
#!/usr/bin/env python2
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def as_4byte_uni(n):
s = hex(n)[2:]
return '\\U%s%s' % ('0' * (8 - len(s)), s)
def expand_cat(c):
return expanded_categories.get(c, []) + [c]
def is_valid_unicode(n):
return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
def read_cats(f):
assigned = defaultdict(list)
for row in csv.reader(f, delimiter=';'):
(hex, cats) = (int(row[0], 16), expand_cat(row[2]))
if not is_valid_unicode(hex):
continue
for cat in cats:
assigned[cat].append(hex)
return assigned
def read_scripts(f):
assigned = defaultdict(list)
for line in f:
line = line.strip()
if not line or line.startswith('#'):
continue
hexes, name = map(str.strip, line.split(';'))[:2]
name = name[:name.index('#')].strip()
if '..' not in hexes:
hex = int(hexes, 16)
if is_valid_unicode(hex):
assigned[name].append(hex)
else:
hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..'))
for hex in xrange(hex1, hex2 + 1):
if is_valid_unicode(hex):
assigned[name].append(hex)
return assigned
def group(letters):
letters = sorted(set(letters))
grouped = []
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
grouped.append((cur_start, cur_end))
cur_start, cur_end = letter, letter
grouped.append((cur_start, cur_end))
return grouped
def ranges_to_rust(rs):
rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
return ',\n '.join(rs)
def groups_to_rust(groups):
rust_groups = []
for group_name in sorted(groups):
rust_groups.append('("%s", &[\n %s\n ]),'
% (group_name, ranges_to_rust(groups[group_name])))
return '\n'.join(rust_groups)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Generate Unicode character class tables.')
aa = parser.add_argument
aa('--local', action='store_true',
help='When set, Scripts.txt and UnicodeData.txt will be read from '
'the CWD.')
aa('--base-url', type=str, default=BASE_URL,
help='The base URL to use for downloading Unicode data files.')
args = parser.parse_args()
if args.local:
cats = read_cats(open(DATA))
scripts = read_scripts(open(SCRIPTS))
else:
cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))
# Get Rust code for all Unicode general categories and scripts.
combined = dict(cats, **scripts)
unigroups = groups_to_rust({k: group(letters)
for k, letters in combined.items()})
# Now get Perl character classes that are Unicode friendly.
perld = range(ord('0'), ord('9') + 1)
dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))
perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
sgroups = ranges_to_rust(group(perls + cats['Z'][:]))
low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
perlw = [ord('_')] + perld + low + up
wgroups = ranges_to_rust(group(perlw + cats['L'][:]))
tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
now = datetime.datetime.now()
print(tpl.format(date=str(now), groups=unigroups,
dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))


@@ -10,17 +10,46 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
# code covering the core properties. Since this is a pretty rare event we
# just store this out-of-line and check the unicode.rs file into git.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - PropList.txt
# - Scripts.txt
# - UnicodeData.txt
#
# The emitted code is "the minimum we think is necessary for libstd", that
# is, to support basic operations of the compiler and "most nontrivial rust
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput, re, os, sys, operator
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics, non_snake_case_functions)]
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
def fetch(f):
if not os.path.exists(f):
@@ -31,21 +60,17 @@ def fetch(f):
sys.stderr.write("cannot load %s" % f)
exit(1)
def is_valid_unicode(n):
return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
combines = []
combines = {}
canon_decomp = {}
compat_decomp = {}
curr_cat = ""
curr_combine = ""
c_lo = 0
c_hi = 0
com_lo = 0
com_hi = 0
for line in fileinput.input(f):
fields = line.split(";")
@@ -58,6 +83,9 @@ def load_unicode_data(f):
code_org = code
code = int(code, 16)
if not is_valid_unicode(code):
continue
# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
@@ -67,6 +95,7 @@ def load_unicode_data(f):
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)
# store decomposition, if given
if decomp != "":
if decomp.startswith('<'):
seq = []
@@ -79,38 +108,76 @@
seq.append(int(i, 16))
canon_decomp[code] = seq
if curr_cat == "":
curr_cat = gencat
c_lo = code
c_hi = code
# place letter in categories as appropriate
for cat in [gencat] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
if curr_cat == gencat:
c_hi = code
else:
if curr_cat not in gencats:
gencats[curr_cat] = []
# record combining class, if any
if combine != "0":
if combine not in combines:
combines[combine] = []
combines[combine].append(code)
gencats[curr_cat].append((c_lo, c_hi))
curr_cat = gencat
c_lo = code
c_hi = code
if curr_combine == "":
curr_combine = combine
com_lo = code
com_hi = code
if curr_combine == combine:
com_hi = code
else:
if curr_combine != "0":
combines.append((com_lo, com_hi, curr_combine))
curr_combine = combine
com_lo = code
com_hi = code
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
def group_cats(cats):
cats_out = {}
for cat in cats:
cats_out[cat] = group_cat(cats[cat])
return cats_out
def group_cat(cat):
cat_out = []
letters = sorted(set(cat))
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
cat_out.append((cur_start, cur_end))
cur_start = cur_end = letter
cat_out.append((cur_start, cur_end))
return cat_out
def ungroup_cat(cat):
cat_out = []
for (lo, hi) in cat:
while lo <= hi:
cat_out.append(lo)
lo += 1
return cat_out
def to_combines(combs):
combs_out = []
for comb in combs:
for (lo, hi) in combs[comb]:
combs_out.append((lo, hi, comb))
combs_out.sort(key=lambda comb: comb[0])
return combs_out
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def load_properties(f, interestingprops):
fetch(f)
props = {}
@@ -134,7 +201,7 @@ def load_properties(f, interestingprops):
prop = m.group(3)
else:
continue
if prop not in interestingprops:
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
@@ -143,6 +210,43 @@
props[prop].append((d_lo, d_hi))
return props
# load all widths of want_widths, except those in except_cats
def load_east_asian_width(want_widths, except_cats):
f = "EastAsianWidth.txt"
fetch(f)
widths = {}
re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
for line in fileinput.input(f):
width = None
d_lo = 0
d_hi = 0
cat = None
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
width = m.group(2)
cat = m.group(3)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
width = m.group(3)
cat = m.group(4)
else:
continue
if cat in except_cats or width not in want_widths:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if width not in widths:
widths[width] = []
widths[width].append((d_lo, d_hi))
return widths
def escape_char(c):
if c <= 0xff:
return "'\\x%2.2x'" % c
@@ -150,59 +254,72 @@ def escape_char(c):
return "'\\u%4.4x'" % c
return "'\\U%8.8x'" % c
def ch_prefix(ix):
if ix == 0:
return " "
if ix % 2 == 0:
return ",\n "
else:
return ", "
def emit_bsearch_range_table(f):
f.write("""
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use option::None;
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::option::None;
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) != None
}\n
""");
""")
def emit_property_module(f, mod, tbl):
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
pub_string = ""
if is_pub:
pub_string = "pub "
f.write(" %sstatic %s: %s = &[\n" % (pub_string, name, t_type))
data = ""
first = True
for dat in t_data:
if not first:
data += ","
first = False
data += pfun(dat)
format_table_content(f, data, 8)
f.write("\n ];\n\n")
def emit_property_module(f, mod, tbl, emit_fn):
f.write("pub mod %s {\n" % mod)
keys = tbl.keys()
keys.sort()
for cat in keys:
if cat not in ["Nd", "Nl", "No", "Cc",
"XID_Start", "XID_Continue", "Alphabetic",
"Lowercase", "Uppercase", "White_Space"]:
continue
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
ix = 0
for pair in tbl[cat]:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(pair[0]), escape_char(pair[1])))
ix += 1
f.write("\n ];\n\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
emit_table(f, "%s_table" % cat, tbl[cat])
if cat in emit_fn:
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
def emit_regex_module(f, cats, w_data):
f.write("pub mod regex {\n")
regex_class = "&'static [(char, char)]"
class_table = "&'static [(&'static str, %s)]" % regex_class
emit_table(f, "UNICODE_CLASSES", cats, class_table,
pfun=lambda x: "(\"%s\",super::%s::%s_table)" % (x[0], x[1], x[0]))
f.write(" pub static PERLD: %s = super::general_category::Nd_table;\n\n"
% regex_class)
f.write(" pub static PERLS: %s = super::property::White_Space_table;\n\n"
% regex_class)
emit_table(f, "PERLW", w_data, regex_class)
f.write("}\n\n")
def emit_conversions_module(f, lowerupper, upperlower):
f.write("pub mod conversions {")
f.write("""
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
use tuple::Tuple2;
use option::{Option, Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
use core::tuple::Tuple2;
use core::option::{Option, Some, None};
pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
@@ -226,189 +343,88 @@ def emit_conversions_module(f, lowerupper, upperlower):
})
}
""");
emit_caseconversion_table(f, "LuLl", upperlower)
emit_caseconversion_table(f, "LlLu", lowerupper)
""")
emit_table(f, "LuLl_table",
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
emit_table(f, "LlLu_table",
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::{Option, Some, None};\n")
f.write(" use core::slice::ImmutableVector;\n")
f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
None => 1
}
}
""")
f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as uint)
}
}
""")
f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n")
def emit_caseconversion_table(f, name, table):
f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
ix = 0
for key, value in sorted_table:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
ix += 1
f.write("\n ];\n\n")
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def emit_core_norm_module(f, canon, compat):
def emit_norm_module(f, canon, compat, combine):
canon_keys = canon.keys()
canon_keys.sort()
compat_keys = compat.keys()
compat_keys.sort()
f.write("pub mod normalization {\n");
f.write(" use option::Option;\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
f.write("""
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}\n\n
""")
f.write("pub mod normalization {\n")
def mkdata_fun(table):
def f(char):
data = "(%s,&[" % escape_char(char)
first = True
for d in table[char]:
if not first:
data += ","
first = False
data += escape_char(d)
data += "])"
return data
return f
f.write(" // Canonical decompositions\n")
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in canon_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in canon[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(canon))
f.write(" // Compatibility decompositions\n")
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in compat_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in compat[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write("""
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
fn d(c: char, i: |char|, k: bool) {
use iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
use cast::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}
}
""")
def emit_std_norm_module(f, combine):
f.write("pub mod normalization {\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
pfun=mkdata_fun(compat))
f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
use cmp::{Equal, Less, Greater};
use core::option::{Some, None};
use core::cmp::{Equal, Less, Greater};
use core::slice::ImmutableVector;
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
@@ -420,72 +436,122 @@ def emit_std_norm_module(f, combine):
}
None => 0
}
}\n\n
}\n
""")
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
ix = 0
for pair in combine:
f.write(ch_prefix(ix))
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
ix += 1
f.write("\n ];\n\n")
emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n")
f.write("}\n")
f.write("""
}
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
""")
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
def remove_from_wtable(wtable, val):
wtable_out = []
while wtable:
if wtable[0][1] < val:
wtable_out.append(wtable.pop(0))
elif wtable[0][0] > val:
break
else:
(wt_lo, wt_hi, width, width_cjk) = wtable.pop(0)
if wt_lo == wt_hi == val:
continue
elif wt_lo == val:
wtable_out.append((wt_lo+1, wt_hi, width, width_cjk))
elif wt_hi == val:
wtable_out.append((wt_lo, wt_hi-1, width, width_cjk))
else:
wtable_out.append((wt_lo, val-1, width, width_cjk))
wtable_out.append((val+1, wt_hi, width, width_cjk))
if wtable:
wtable_out.extend(wtable)
return wtable_out
#![allow(missing_doc, non_uppercase_statics)]
def optimize_width_table(wtable):
wtable_out = []
w_this = wtable.pop(0)
while wtable:
if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]:
w_tmp = wtable.pop(0)
w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3])
else:
wtable_out.append(w_this)
w_this = wtable.pop(0)
wtable_out.append(w_this)
return wtable_out
'''
(canon_decomp, compat_decomp, gencats,
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
def gen_core_unicode():
r = "core_unicode.rs"
if __name__ == "__main__":
r = "unicode.rs"
if os.path.exists(r):
os.remove(r);
os.remove(r)
with open(r, "w") as rf:
# Preamble
# write the file's preamble
rf.write(preamble)
emit_bsearch_range_table(rf);
emit_property_module(rf, "general_category", gencats)
# download and parse all the data
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
other_derived = ["Default_Ignorable_Code_Point"]
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
emit_core_norm_module(rf, canon_decomp, compat_decomp)
# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
# all of these categories will also be available as \p{} in libregex
allcats = []
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
("derived_property", derived, want_derived), \
("script", scripts, []), \
("property", props, ["White_Space"]):
emit_property_module(rf, name, cat, pfuns)
allcats.extend(map(lambda x: (x, name), cat))
allcats.sort(key=lambda c: c[0])
emit_property_module(rf, "derived_property", derived)
# the \w regex corresponds to Alphabetic + Mark + Decimal_Number +
# Connector_Punctuation + Join-Control according to UTS#18
# http://www.unicode.org/reports/tr18/#Compatibility_Properties
perl_words = []
for cat in derived["Alphabetic"], gencats["M"], gencats["Nd"], \
gencats["Pc"], props["Join_Control"]:
perl_words.extend(ungroup_cat(cat))
perl_words = group_cat(perl_words)
props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
emit_regex_module(rf, allcats, perl_words)
# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_conversions_module(rf, lowerupper, upperlower)
def gen_std_unicode():
r = "std_unicode.rs"
if os.path.exists(r):
os.remove(r);
with open(r, "w") as rf:
# Preamble
rf.write(preamble)
emit_std_norm_module(rf, combines)
# character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
width_table.append((4448, 4607, 0, 0))
gen_core_unicode()
gen_std_unicode()
# get widths, except those that are explicitly marked zero-width above
ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
# these are doublewidth
for dwcat in ["W", "F"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 2, 2), ea_widths[dwcat]))
width_table.extend(map(lambda (lo, hi): (lo, hi, 1, 2), ea_widths["A"]))
width_table.sort(key=lambda w: w[0])
# soft hyphen is not zero width in preformatted text; it's used to indicate
# a hyphen inserted to facilitate a linebreak.
width_table = remove_from_wtable(width_table, 173)
# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)


@@ -28,6 +28,7 @@
#![allow(unused_attribute)] // NOTE: remove after stage0
#[phase(plugin, link)] extern crate core;
extern crate unicode;
extern crate alloc;
#[cfg(test)] extern crate native;
@@ -69,9 +70,6 @@ pub mod string;
pub mod vec;
pub mod hash;
// Internal unicode fiddly bits for the str module
mod unicode;
mod deque;
/// A trait to represent mutable containers


@@ -69,7 +69,6 @@ is the same as `&[u8]`.
use core::prelude::*;
use core::char;
use core::default::Default;
use core::fmt;
use core::cmp;
@@ -79,15 +78,17 @@ use core::mem;
use Collection;
use hash;
use string::String;
use unicode;
use vec::Vec;
pub use core::str::{from_utf8, CharEq, Chars, CharOffsets};
pub use core::str::{Bytes, CharSplits};
pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits};
pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits};
pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items};
pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items};
pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange};
pub use core::str::{Str, StrSlice};
pub use unicode::{Words, UnicodeStrSlice};
/*
Section: Creating a string
@@ -283,7 +284,7 @@ pub struct Decompositions<'a> {
impl<'a> Iterator<char> for Decompositions<'a> {
#[inline]
fn next(&mut self) -> Option<char> {
use unicode::normalization::canonical_combining_class;
use unicode::canonical_combining_class;
match self.buffer.as_slice().head() {
Some(&(c, 0)) => {
@@ -299,8 +300,8 @@ impl<'a> Iterator<char> for Decompositions<'a> {
}
let decomposer = match self.kind {
Canonical => char::decompose_canonical,
Compatible => char::decompose_compatible
Canonical => unicode::char::decompose_canonical,
Compatible => unicode::char::decompose_compatible
};
if !self.sorted {


@@ -1,183 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
#![allow(missing_doc, non_uppercase_statics)]
pub mod normalization {
use core::prelude::*;
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
None => 0
}
}
static combining_class_table : &'static [(char, char, u8)] = &[
('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232),
('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232),
('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220),
('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220),
('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220),
('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220),
('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240),
('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220),
('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220),
('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220),
('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232),
('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230),
('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234),
('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234),
('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230),
('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220),
('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220),
('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222),
('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230),
('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230),
('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230),
('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228),
('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10),
('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12),
('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14),
('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16),
('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18),
('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20),
('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22),
('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24),
('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230),
('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18),
('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30),
('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32),
('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28),
('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30),
('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32),
('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34),
('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220),
('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220),
('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220),
('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230),
('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220),
('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230),
('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230),
('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36),
('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220),
('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220),
('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220),
('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220),
('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220),
('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220),
('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220),
('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220),
('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220),
('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230),
('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230),
('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230),
('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230),
('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230),
('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230),
('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230),
('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27),
('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29),
('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220),
('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220),
('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7),
('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230),
('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230),
('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9),
('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9),
('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9),
('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9),
('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9),
('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91),
('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9),
('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9),
('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9),
('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118),
('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220),
('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220),
('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129),
('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132),
('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130),
('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9),
('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220),
('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9),
('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230),
('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9),
('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230),
('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222),
('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220),
('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220),
('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230),
('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7),
('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230),
('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230),
('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7),
('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7),
('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1),
('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230),
('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230),
('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220),
('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230),
('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230),
('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230),
('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214),
('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202),
('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233),
('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230),
('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230),
('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230),
('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230),
('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1),
('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220),
('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1),
('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230),
('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9),
('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218),
('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232),
('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224),
('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230),
('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230),
('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9),
('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230),
('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9),
('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9),
('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230),
('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230),
('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230),
('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9),
('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230),
('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220),
('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230),
('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220),
('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9),
('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7),
('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9),
('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9),
('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216),
('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226),
('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220),
('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220),
('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230)
];
pub fn canonical_combining_class(c: char) -> u8 {
bsearch_range_value_table(c, combining_class_table)
}
}


@@ -8,20 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Character manipulation (`char` type, Unicode Scalar Value)
//! Character manipulation.
//!
//! This module provides the `Char` trait, as well as its implementation
//! for the primitive `char` type, in order to allow basic character manipulation.
//!
//! A `char` actually represents a
//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
//! as it can contain any Unicode code point except high-surrogate and
//! low-surrogate code points.
//!
//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
//! however the converse is not always true due to the above range limits
//! and, as such, should be performed via the `from_u32` function..
//! For more details, see ::unicode::char (a.k.a. std::char)
#![allow(non_snake_case_functions)]
#![doc(primitive = "char")]
@@ -29,12 +18,6 @@
use mem::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use unicode::{derived_property, property, general_category, conversions};
/// Returns the canonical decomposition of a character.
pub use unicode::normalization::decompose_canonical;
/// Returns the compatibility decomposition of a character.
pub use unicode::normalization::decompose_compatible;
// UTF-8 ranges and tags for encoding characters
static TAG_CONT: u8 = 0b1000_0000u8;
@@ -93,84 +76,6 @@ pub fn from_u32(i: u32) -> Option<char> {
}
}
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
/// Indicates whether a `char` is in lower case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
///
/// Indicates whether a `char` is in upper case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
///
/// Indicates whether a `char` is whitespace
///
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| property::White_Space(c)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Indicates whether a `char` is a control code point
///
/// Control code points are defined in terms of the Unicode General Category
/// 'Cc'.
///
#[inline]
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
#[inline]
pub fn is_digit(c: char) -> bool {
general_category::Nd(c)
|| general_category::Nl(c)
|| general_category::No(c)
}
///
/// Checks if a `char` parses as a numeric digit in the given radix
///
@@ -227,38 +132,6 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
else { None }
}
/// Convert a char to its uppercase equivalent
///
/// The case-folding performed is the common or simple mapping:
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// Returns the char itself if no conversion was made
#[inline]
pub fn to_uppercase(c: char) -> char {
conversions::to_upper(c)
}
/// Convert a char to its lowercase equivalent
///
/// The case-folding performed is the common or simple mapping
/// see `to_uppercase` for references and more information
///
/// # Return value
///
/// Returns the char itself if no conversion if possible
#[inline]
pub fn to_lowercase(c: char) -> char {
conversions::to_lower(c)
}
///
/// Converts a number to the character representing it
///
@@ -355,61 +228,8 @@ pub fn len_utf8_bytes(c: char) -> uint {
}
}
/// Useful functions for Unicode characters.
/// Basic `char` manipulations.
pub trait Char {
/// Returns whether the specified character is considered a Unicode
/// alphabetic code point.
fn is_alphabetic(&self) -> bool;
/// Returns whether the specified character satisfies the 'XID_Start'
/// Unicode property.
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
fn is_XID_start(&self) -> bool;
/// Returns whether the specified `char` satisfies the 'XID_Continue'
/// Unicode property.
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
fn is_XID_continue(&self) -> bool;
/// Indicates whether a character is in lowercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Lowercase`.
fn is_lowercase(&self) -> bool;
/// Indicates whether a character is in uppercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Uppercase`.
fn is_uppercase(&self) -> bool;
/// Indicates whether a character is whitespace.
///
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
fn is_whitespace(&self) -> bool;
/// Indicates whether a character is alphanumeric.
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
fn is_alphanumeric(&self) -> bool;
/// Indicates whether a character is a control code point.
///
/// Control code points are defined in terms of the Unicode General
/// Category `Cc`.
fn is_control(&self) -> bool;
/// Indicates whether the character is numeric (Nd, Nl, or No).
fn is_digit(&self) -> bool;
/// Checks if a `char` parses as a numeric digit in the given radix.
///
/// Compared to `is_digit()`, this function only recognizes the characters
@@ -438,37 +258,6 @@ pub trait Char {
/// Fails if given a radix outside the range [0..36].
fn to_digit(&self, radix: uint) -> Option<uint>;
/// Converts a character to its lowercase equivalent.
///
/// The case-folding performed is the common or simple mapping. See
/// `to_uppercase()` for references and more information.
///
/// # Return value
///
/// Returns the lowercase equivalent of the character, or the character
/// itself if no conversion is possible.
fn to_lowercase(&self) -> char;
/// Converts a character to its uppercase equivalent.
///
/// The case-folding performed is the common or simple mapping: it maps
/// one unicode codepoint (one character in Rust) to its uppercase
/// equivalent according to the Unicode database [1]. The additional
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here [2].
///
/// # Return value
///
/// Returns the uppercase equivalent of the character, or the character
/// itself if no conversion was made.
///
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
///
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
fn to_uppercase(&self) -> char;
/// Converts a number to the character representing it.
///
/// # Return value
@@ -526,32 +315,10 @@ pub trait Char {
}
impl Char for char {
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
fn is_control(&self) -> bool { is_control(*self) }
fn is_digit(&self) -> bool { is_digit(*self) }
fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
fn to_lowercase(&self) -> char { to_lowercase(*self) }
fn to_uppercase(&self) -> char { to_uppercase(*self) }
fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
@@ -600,5 +367,3 @@ impl Char for char {
}
}
}


@@ -108,7 +108,6 @@ pub mod collections;
/* Core types and methods on primitives */
mod unicode;
pub mod any;
pub mod atomics;
pub mod bool;


@@ -22,7 +22,7 @@ use cmp;
use cmp::{PartialEq, Eq};
use collections::Collection;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Map, Iterator};
use iter::{DoubleEndedIterator, ExactSize};
use iter::range;
use num::{CheckedMul, Saturating};
@@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
invert: bool,
}
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
pub type AnyLines<'a> =
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
@@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
/// ```
fn lines_any(&self) -> AnyLines<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
///
/// # Example
///
/// ```rust
/// let some_words = " Mary had\ta little \n\t lamb";
/// let v: Vec<&str> = some_words.words().collect();
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
/// ```
fn words(&self) -> Words<'a>;
/// Returns true if the string contains only whitespace.
///
/// Whitespace characters are determined by `char::is_whitespace`.
///
/// # Example
///
/// ```rust
/// assert!(" \t\n".is_whitespace());
/// assert!("".is_whitespace());
///
/// assert!( !"abc".is_whitespace());
/// ```
fn is_whitespace(&self) -> bool;
/// Returns true if the string contains only alphanumeric code
/// points.
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
///
/// # Example
///
/// ```rust
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
/// assert!("".is_alphanumeric());
///
/// assert!( !" &*~".is_alphanumeric());
/// ```
fn is_alphanumeric(&self) -> bool;
/// Returns the number of Unicode code points (`char`) that a
/// string holds.
///
@@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
/// Returns true if `needle` is a suffix of the string.
fn ends_with(&self, needle: &str) -> bool;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
/// Returns a string with leading whitespace removed.
fn trim_left(&self) -> &'a str;
/// Returns a string with trailing whitespace removed.
fn trim_right(&self) -> &'a str;
/// Returns a string with characters that match `to_trim` removed.
///
/// # Arguments
@@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
})
}
#[inline]
fn words(&self) -> Words<'a> {
self.split(char::is_whitespace).filter(|s| !s.is_empty())
}
#[inline]
fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
#[inline]
fn char_len(&self) -> uint { self.chars().count() }
@@ -1814,21 +1748,6 @@ impl<'a> StrSlice<'a> for &'a str {
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
}
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
}
#[inline]
fn trim_left(&self) -> &'a str {
self.trim_left_chars(char::is_whitespace)
}
#[inline]
fn trim_right(&self) -> &'a str {
self.trim_right_chars(char::is_whitespace)
}
#[inline]
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
let cur = match self.find(|c: char| !to_trim.matches(c)) {

File diff suppressed because it is too large.


@@ -200,3 +200,30 @@ fn test_encode_utf16() {
check('\ua66e', [0xa66e]);
check('\U0001f4a9', [0xd83d, 0xdca9]);
}
#[test]
fn test_width() {
assert_eq!('\x00'.width(false),Some(0));
assert_eq!('\x00'.width(true),Some(0));
assert_eq!('\x0A'.width(false),None);
assert_eq!('\x0A'.width(true),None);
assert_eq!('w'.width(false),Some(1));
assert_eq!('w'.width(true),Some(1));
assert_eq!(''.width(false),Some(2));
assert_eq!(''.width(true),Some(2));
assert_eq!('\xAD'.width(false),Some(1));
assert_eq!('\xAD'.width(true),Some(1));
assert_eq!('\u1160'.width(false),Some(0));
assert_eq!('\u1160'.width(true),Some(0));
assert_eq!('\u00a1'.width(false),Some(1));
assert_eq!('\u00a1'.width(true),Some(2));
assert_eq!('\u0300'.width(false),Some(0));
assert_eq!('\u0300'.width(true),Some(0));
}


@@ -306,12 +306,15 @@
//!
//! ## Perl character classes (Unicode friendly)
//!
//! These classes are based on the definitions provided in
//! [UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
//!
//! <pre class="rust">
//! \d digit ([0-9] + \p{Nd})
//! \d digit (\p{Nd})
//! \D not digit
//! \s whitespace ([\t\n\f\r ] + \p{Z})
//! \s whitespace (\p{White_Space})
//! \S not whitespace
//! \w word character ([0-9A-Za-z_] + \p{L})
//! \w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
//! \W not word character
//! </pre>
//!
@@ -378,6 +381,9 @@ extern crate rand;
#[cfg(test)]
extern crate regex;
// unicode tables for character classes are defined in libunicode
extern crate unicode;
pub use parse::Error;
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
pub use re::{FindCaptures, FindMatches};


@@ -16,9 +16,7 @@ use std::num;
use std::str;
/// Static data containing Unicode ranges for general categories and scripts.
use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
#[allow(visible_private_types)]
pub mod unicode;
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
/// The maximum number of repetitions allowed with the `{n,m}` syntax.
static MAX_REPEAT: uint = 1000;

File diff suppressed because it is too large.


@@ -42,7 +42,7 @@ use compile::{
Save, Jump, Split,
};
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
use parse::unicode::PERLW;
use unicode::regex::PERLW;
pub type CaptureLocs = Vec<Option<uint>>;


@@ -237,6 +237,7 @@ use str::{Str, StrSlice, StrAllocating};
use str;
use string::String;
use uint;
use unicode::UnicodeChar;
use vec::Vec;
// Reexports


@@ -126,6 +126,7 @@
#[cfg(test)] #[phase(plugin, link)] extern crate log;
extern crate alloc;
extern crate unicode;
extern crate core;
extern crate core_collections = "collections";
extern crate core_rand = "rand";
@@ -148,7 +149,6 @@ extern crate rustrt;
pub use core::any;
pub use core::bool;
pub use core::cell;
pub use core::char;
pub use core::clone;
#[cfg(not(test))] pub use core::cmp;
pub use core::default;
@@ -180,6 +180,8 @@ pub use core_collections::vec;
pub use rustrt::c_str;
pub use rustrt::local_data;
pub use unicode::char;
pub use core_sync::comm;
// Run tests with libgreen instead of libnative.


@ -24,6 +24,7 @@ use option::{Option, Some, None};
use slice::{Vector, ImmutableVector};
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
use string::String;
use unicode::UnicodeChar;
use vec::Vec;
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
@ -997,7 +998,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
let idx = path.find('\\');
if idx == Some(2) && path.as_bytes()[1] == ':' as u8 {
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
// \\?\C:\ path
return Some(VerbatimDiskPrefix);
}
@ -1021,7 +1022,7 @@ fn parse_prefix<'a>(mut path: &'a str) -> Option<PathPrefix> {
} else if path.len() > 1 && path.as_bytes()[1] == ':' as u8 {
// C:
let c = path.as_bytes()[0];
if c.is_ascii() && ::char::is_alphabetic(c as char) {
if c.is_ascii() && (c as char).is_alphabetic() {
return Some(DiskPrefix);
}
}


@ -88,6 +88,7 @@
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
#[doc(no_inline)] pub use string::String;
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
#[doc(no_inline)] pub use vec::Vec;
// Reexported runtime types


@ -12,7 +12,6 @@
#![allow(non_camel_case_types)]
use char::Char;
use collections::Collection;
use from_str::from_str;
use io::{IoResult, Writer};
@ -22,6 +21,7 @@ use os;
use result::{Ok, Err};
use str::StrSlice;
use sync::atomics;
use unicode::UnicodeChar;
pub use self::imp::write;

src/libunicode/decompose.rs (new file, 111 lines)

@ -0,0 +1,111 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
Functions for computing canonical and compatible decompositions
for Unicode characters.
*/
use core::option::{Option, Some, None};
use core::slice::ImmutableVector;
use tables::normalization::{canonical_table, compatibility_table};
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}
/// Compute canonical Unicode decomposition for character
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
/// Compute canonical or compatible Unicode decomposition for character
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
fn d(c: char, i: |char|, k: bool) {
use core::iter::Iterator;
// 7-bit ASCII never decomposes
if c <= '\x7f' { i(c); return; }
// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}
// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Bottom out if we're not doing compat.
if !k { i(c); return; }
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}
// Finally bottom out.
i(c);
}
// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
use core::mem::transmute;
let si = s as u32 - S_BASE;
let li = si / N_COUNT;
unsafe {
f(transmute(L_BASE + li));
let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));
let ti = si % T_COUNT;
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}
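To make the Jamo arithmetic concrete: for U+D55C (HANGUL SYLLABLE HAN), si = 0xD55C - 0xAC00 = 10588, so li = 10588 / 588 = 18, vi = (10588 % 588) / 28 = 0, and ti = 10588 % 28 = 4, yielding U+1112, U+1161, U+11AB. A short sketch, assuming a libstd consumer (where `std::char` re-exports `decompose_canonical`):

    use std::char::decompose_canonical;

    fn main() {
        let mut parts = Vec::new();
        // U+D55C goes through decompose_hangul, per the arithmetic above
        decompose_canonical('\uD55C', |c| parts.push(c));
        assert_eq!(parts, vec!['\u1112', '\u1161', '\u11AB']);
    }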

src/libunicode/lib.rs (new file, 77 lines)

@ -0,0 +1,77 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! # The Unicode Library
//!
//! Unicode-intensive functions for `char` and `str` types.
//!
//! This crate provides a collection of Unicode-related functionality,
//! including decompositions, conversions, etc., and provides traits
//! implementing these functions for the `char` and `str` types.
//!
//! The functionality included here is only that which is necessary to
//! provide for basic string-related manipulations. This crate does not
//! (yet) aim to provide a full set of Unicode tables.
#![crate_id = "unicode#0.11.0"]
#![crate_name = "unicode"]
#![experimental]
#![license = "MIT/ASL2"]
#![crate_type = "rlib"]
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
html_root_url = "http://doc.rust-lang.org/",
html_playground_url = "http://play.rust-lang.org/")]
#![no_std]
#![allow(unused_attribute)] // NOTE: remove after stage0
extern crate core;
pub use tables::normalization::canonical_combining_class;
pub use tables::regex;
pub use u_char::UnicodeChar;
pub use u_str::UnicodeStrSlice;
pub use u_str::Words;
mod decompose;
mod tables;
mod u_char;
mod u_str;
// re-export char so that std et al see it correctly
/// Character manipulation (`char` type, Unicode Scalar Value)
///
/// This module provides the `Char` and `UnicodeChar` traits, as well as their
/// implementation for the primitive `char` type, in order to allow basic character
/// manipulation.
///
/// A `char` actually represents a
/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
/// as it can contain any Unicode code point except high-surrogate and
/// low-surrogate code points.
///
/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
/// however, the converse is not always true due to the above range limits
/// and, as such, should be performed via the `from_u32` function.
pub mod char {
pub use core::char::{MAX, from_u32, is_digit_radix, to_digit};
pub use core::char::{from_digit, escape_unicode, escape_default};
pub use core::char::{len_utf8_bytes, Char};
pub use decompose::decompose_canonical;
pub use decompose::decompose_compatible;
pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
pub use u_char::{is_alphanumeric, is_control, is_digit};
pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
}
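For programs that link libunicode directly (as the commit message describes), the re-exported module makes the free functions reachable without libstd's prelude; a minimal sketch:

    extern crate unicode;

    fn main() {
        // free function re-exported through unicode::char
        assert!(unicode::char::is_alphabetic('京'));
        // only the simple case mapping is applied: U+00DF (ß) has no
        // single-codepoint uppercase, so it is returned unchanged
        assert_eq!(unicode::char::to_uppercase('\u00df'), '\u00df');
    }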

src/libunicode/tables.rs (new file, 6445 lines)
File diff suppressed because it is too large

src/libunicode/u_char.rs (new file, 266 lines)

@ -0,0 +1,266 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive `char` methods.
*
* These methods implement functionality for `char` that requires knowledge of
* Unicode definitions, including normalization, categorization, and display information.
*/
use core::option::Option;
use tables::{derived_property, property, general_category, conversions, charwidth};
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
#[allow(non_snake_case_functions)]
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
#[allow(non_snake_case_functions)]
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
/// Indicates whether a `char` is in lower case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
///
/// Indicates whether a `char` is in upper case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
///
/// Indicates whether a `char` is whitespace
///
/// Whitespace is defined in terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| property::White_Space(c)
}
///
/// Indicates whether a `char` is alphanumeric
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
///
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
derived_property::Alphabetic(c)
|| general_category::N(c)
}
///
/// Indicates whether a `char` is a control code point
///
/// Control code points are defined in terms of the Unicode General Category
/// 'Cc'.
///
#[inline]
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
/// Indicates whether the `char` is numeric (Nd, Nl, or No)
#[inline]
pub fn is_digit(c: char) -> bool {
general_category::N(c)
}
/// Convert a char to its uppercase equivalent
///
/// The case-folding performed is the common or simple mapping:
/// it maps one Unicode code point (one char in Rust) to its uppercase equivalent according
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Return value
///
/// Returns the char itself if no conversion was made
#[inline]
pub fn to_uppercase(c: char) -> char {
conversions::to_upper(c)
}
/// Convert a char to its lowercase equivalent
///
/// The case-folding performed is the common or simple mapping;
/// see `to_uppercase` for references and more information.
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(c: char) -> char {
conversions::to_lower(c)
}
/// Returns this character's displayed width in columns, or `None` if it is a
/// control character other than `'\x00'`.
///
/// `is_cjk` determines behavior for characters in the Ambiguous category:
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the context cannot be reliably determined.
pub fn width(c: char, is_cjk: bool) -> Option<uint> {
charwidth::width(c, is_cjk)
}
/// Useful functions for Unicode characters.
pub trait UnicodeChar {
/// Returns whether the specified character is considered a Unicode
/// alphabetic code point.
fn is_alphabetic(&self) -> bool;
/// Returns whether the specified character satisfies the 'XID_Start'
/// Unicode property.
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to ID_Start but modified for closure under NFKx.
#[allow(non_snake_case_functions)]
fn is_XID_start(&self) -> bool;
/// Returns whether the specified `char` satisfies the 'XID_Continue'
/// Unicode property.
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
#[allow(non_snake_case_functions)]
fn is_XID_continue(&self) -> bool;
/// Indicates whether a character is in lowercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Lowercase`.
fn is_lowercase(&self) -> bool;
/// Indicates whether a character is in uppercase.
///
/// This is defined according to the terms of the Unicode Derived Core
/// Property `Uppercase`.
fn is_uppercase(&self) -> bool;
/// Indicates whether a character is whitespace.
///
/// Whitespace is defined in terms of the Unicode Property `White_Space`.
fn is_whitespace(&self) -> bool;
/// Indicates whether a character is alphanumeric.
///
/// Alphanumericness is defined in terms of the Unicode General Categories
/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
fn is_alphanumeric(&self) -> bool;
/// Indicates whether a character is a control code point.
///
/// Control code points are defined in terms of the Unicode General
/// Category `Cc`.
fn is_control(&self) -> bool;
/// Indicates whether the character is numeric (Nd, Nl, or No).
fn is_digit(&self) -> bool;
/// Converts a character to its lowercase equivalent.
///
/// The case-folding performed is the common or simple mapping. See
/// `to_uppercase()` for references and more information.
///
/// # Return value
///
/// Returns the lowercase equivalent of the character, or the character
/// itself if no conversion is possible.
fn to_lowercase(&self) -> char;
/// Converts a character to its uppercase equivalent.
///
/// The case-folding performed is the common or simple mapping: it maps
/// one Unicode code point (one character in Rust) to its uppercase
/// equivalent according to the Unicode database [1]. The additional
/// `SpecialCasing.txt` is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here [2].
///
/// # Return value
///
/// Returns the uppercase equivalent of the character, or the character
/// itself if no conversion was made.
///
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
///
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
fn to_uppercase(&self) -> char;
/// Returns this character's displayed width in columns, or `None` if it is a
/// control character other than `'\x00'`.
///
/// `is_cjk` determines behavior for characters in the Ambiguous category:
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
/// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the context cannot be reliably determined.
fn width(&self, is_cjk: bool) -> Option<uint>;
}
impl UnicodeChar for char {
fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
fn is_XID_start(&self) -> bool { is_XID_start(*self) }
fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
fn is_lowercase(&self) -> bool { is_lowercase(*self) }
fn is_uppercase(&self) -> bool { is_uppercase(*self) }
fn is_whitespace(&self) -> bool { is_whitespace(*self) }
fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
fn is_control(&self) -> bool { is_control(*self) }
fn is_digit(&self) -> bool { is_digit(*self) }
fn to_lowercase(&self) -> char { to_lowercase(*self) }
fn to_uppercase(&self) -> char { to_uppercase(*self) }
fn width(&self, is_cjk: bool) -> Option<uint> { width(*self, is_cjk) }
}
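With libstd, where `UnicodeChar` sits in the prelude, the same functionality reads as methods; a brief sketch (the `width` expectations mirror the tests earlier in this diff):

    fn main() {
        assert!('京'.is_alphabetic());
        assert!('\u0663'.is_digit());        // ARABIC-INDIC DIGIT THREE is Nd
        assert_eq!('\u0394'.to_lowercase(), '\u03b4'); // GREEK CAPITAL DELTA
        assert_eq!('\u00a1'.width(false), Some(1));
        assert_eq!('\u00a1'.width(true), Some(2));     // Ambiguous: 2 columns in CJK
    }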

src/libunicode/u_str.rs (new file, 119 lines)

@ -0,0 +1,119 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
* Unicode-intensive string manipulations.
*
* This module provides functionality to `str` that requires the Unicode
* methods provided by the UnicodeChar trait.
*/
use core::collections::Collection;
use core::iter::{Filter};
use core::str::{CharSplits, StrSlice};
use core::iter::Iterator;
use u_char;
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// Methods for Unicode string slices
pub trait UnicodeStrSlice<'a> {
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
///
/// # Example
///
/// ```rust
/// let some_words = " Mary had\ta little \n\t lamb";
/// let v: Vec<&str> = some_words.words().collect();
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
/// ```
fn words(&self) -> Words<'a>;
/// Returns true if the string contains only whitespace.
///
/// Whitespace characters are determined by `char::is_whitespace`.
///
/// # Example
///
/// ```rust
/// assert!(" \t\n".is_whitespace());
/// assert!("".is_whitespace());
///
/// assert!( !"abc".is_whitespace());
/// ```
fn is_whitespace(&self) -> bool;
/// Returns true if the string contains only alphanumeric code
/// points.
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
///
/// # Example
///
/// ```rust
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
/// assert!("".is_alphanumeric());
///
/// assert!( !" &*~".is_alphanumeric());
/// ```
fn is_alphanumeric(&self) -> bool;
/// Returns a string's displayed width in columns, treating control
/// characters as zero-width.
///
/// `is_cjk` determines behavior for characters in the Ambiguous category:
/// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
/// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the locale is unknown.
//fn width(&self, is_cjk: bool) -> uint;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
/// Returns a string with leading whitespace removed.
fn trim_left(&self) -> &'a str;
/// Returns a string with trailing whitespace removed.
fn trim_right(&self) -> &'a str;
}
impl<'a> UnicodeStrSlice<'a> for &'a str {
#[inline]
fn words(&self) -> Words<'a> {
self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
}
#[inline]
fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
}
#[inline]
fn trim_left(&self) -> &'a str {
self.trim_left_chars(u_char::is_whitespace)
}
#[inline]
fn trim_right(&self) -> &'a str {
self.trim_right_chars(u_char::is_whitespace)
}
}
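And correspondingly for string slices, again assuming libstd so that `UnicodeStrSlice` is in the prelude:

    fn main() {
        let v: Vec<&str> = " Mary had\ta little \n\t lamb".words().collect();
        assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
        // U+2009 THIN SPACE carries White_Space, so trim removes it too
        assert_eq!("\u2009 hi \u2009".trim(), "hi");
    }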