mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-27 01:04:03 +00:00
std: Stabilize the std::str module
This commit starts out by consolidating all `str` extension traits into one `StrExt` trait to be included in the prelude. This means that `UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into one `StrExt` exported by the standard library. Some functionality is currently duplicated with the `StrExt` present in libcore. This commit also currently avoids any methods which require any form of pattern to operate. These functions will be stabilized via a separate RFC. Next, the stability of methods and structures is as follows: Stable * from_utf8_unchecked * CowString - after moving to std::string * StrExt::as_bytes * StrExt::as_ptr * StrExt::bytes/Bytes - also made a struct instead of a typedef * StrExt::char_indices/CharIndices - CharOffsets was renamed * StrExt::chars/Chars * StrExt::is_empty * StrExt::len * StrExt::lines/Lines * StrExt::lines_any/LinesAny * StrExt::slice_unchecked * StrExt::trim * StrExt::trim_left * StrExt::trim_right * StrExt::words/Words - also made a struct instead of a typedef Unstable * from_utf8 - the return type was changed to a `Result`, but the error type has yet to prove itself * from_c_str - this function will be handled by the c_str RFC * FromStr - this trait will have an associated error type eventually * StrExt::escape_default - needs iterators at least, unsure if it should make the cut * StrExt::escape_unicode - needs iterators at least, unsure if it should make the cut * StrExt::slice_chars - this function has yet to prove itself * StrExt::slice_shift_char - awaiting conventions about slicing and shifting * StrExt::graphemes/Graphemes - this functionality may only be in libunicode * StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in libunicode * StrExt::width - this functionality may only be in libunicode * StrExt::utf16_units - this functionality may only be in libunicode * StrExt::nfd_chars - this functionality may only be in libunicode * StrExt::nfkd_chars - this functionality may 
only be in libunicode * StrExt::nfc_chars - this functionality may only be in libunicode * StrExt::nfkc_chars - this functionality may only be in libunicode * StrExt::is_char_boundary - naming is uncertain with container conventions * StrExt::char_range_at - naming is uncertain with container conventions * StrExt::char_range_at_reverse - naming is uncertain with container conventions * StrExt::char_at - naming is uncertain with container conventions * StrExt::char_at_reverse - naming is uncertain with container conventions * StrVector::concat - this functionality may be replaced with iterators, but it's not certain at this time * StrVector::connect - as with concat, may be deprecated in favor of iterators Deprecated * StrAllocating and UnicodeStrPrelude have been merged into StrExt * eq_slice - compiler implementation detail * from_str - use the inherent parse() method * is_utf8 - call from_utf8 instead * replace - call the method instead * truncate_utf16_at_nul - this is an implementation detail of Windows and does not need to be exposed. * utf8_char_width - moved to libunicode * utf16_items - moved to libunicode * is_utf16 - moved to libunicode * Utf16Items - moved to libunicode * Utf16Item - moved to libunicode * Utf16Encoder - moved to libunicode * AnyLines - renamed to LinesAny and made a struct * SendStr - use CowString<'static> instead * str::raw - all functionality is deprecated * StrExt::into_string - call to_string() instead * StrExt::repeat - use iterators instead * StrExt::char_len - use .chars().count() instead * StrExt::is_alphanumeric - use .chars().all(..) * StrExt::is_whitespace - use .chars().all(..) Pending deprecation -- while slicing syntax is being worked out, these methods are all #[unstable] * Str - while currently used for generic programming, this trait will be replaced with one of [], deref coercions, or a generic conversion trait. 
* StrExt::slice - use slicing syntax instead * StrExt::slice_to - use slicing syntax instead * StrExt::slice_from - use slicing syntax instead * StrExt::lev_distance - deprecated with no replacement Awaiting stabilization due to patterns and/or matching * StrExt::contains * StrExt::contains_char * StrExt::split * StrExt::splitn * StrExt::split_terminator * StrExt::rsplitn * StrExt::match_indices * StrExt::split_str * StrExt::starts_with * StrExt::ends_with * StrExt::trim_chars * StrExt::trim_left_chars * StrExt::trim_right_chars * StrExt::find * StrExt::rfind * StrExt::find_str * StrExt::subslice_offset
This commit is contained in:
parent
34d6800092
commit
4908017d59
File diff suppressed because it is too large
Load Diff
@ -21,13 +21,12 @@ use core::hash;
|
||||
use core::mem;
|
||||
use core::ptr;
|
||||
use core::ops;
|
||||
// FIXME: ICE's abound if you import the `Slice` type while importing `Slice` trait
|
||||
use core::raw::Slice as RawSlice;
|
||||
use unicode::str as unicode_str;
|
||||
use unicode::str::Utf16Item;
|
||||
|
||||
use slice::CloneSliceExt;
|
||||
use str;
|
||||
use str::{CharRange, CowString, FromStr, StrAllocating};
|
||||
use str::MaybeOwned::Owned;
|
||||
use str::{mod, CharRange, FromStr, StrExt, Owned, Utf8Error};
|
||||
use vec::{DerefVec, Vec, as_vec};
|
||||
|
||||
/// A growable string stored as a UTF-8 encoded buffer.
|
||||
@ -87,8 +86,10 @@ impl String {
|
||||
/// Returns the vector as a string buffer, if possible, taking care not to
|
||||
/// copy it.
|
||||
///
|
||||
/// Returns `Err` with the original vector if the vector contains invalid
|
||||
/// UTF-8.
|
||||
/// # Failure
|
||||
///
|
||||
/// If the given vector is not valid UTF-8, then the original vector and the
|
||||
/// corresponding error is returned.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -103,11 +104,10 @@ impl String {
|
||||
/// ```
|
||||
#[inline]
|
||||
#[unstable = "error type may change"]
|
||||
pub fn from_utf8(vec: Vec<u8>) -> Result<String, Vec<u8>> {
|
||||
if str::is_utf8(vec.as_slice()) {
|
||||
Ok(String { vec: vec })
|
||||
} else {
|
||||
Err(vec)
|
||||
pub fn from_utf8(vec: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
|
||||
match str::from_utf8(vec.as_slice()) {
|
||||
Ok(..) => Ok(String { vec: vec }),
|
||||
Err(e) => Err((vec, e))
|
||||
}
|
||||
}
|
||||
|
||||
@ -123,8 +123,9 @@ impl String {
|
||||
/// ```
|
||||
#[unstable = "return type may change"]
|
||||
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> CowString<'a> {
|
||||
if str::is_utf8(v) {
|
||||
return Cow::Borrowed(unsafe { mem::transmute(v) })
|
||||
match str::from_utf8(v) {
|
||||
Ok(s) => return Cow::Borrowed(s),
|
||||
Err(..) => {}
|
||||
}
|
||||
|
||||
static TAG_CONT_U8: u8 = 128u8;
|
||||
@ -173,7 +174,7 @@ impl String {
|
||||
if byte < 128u8 {
|
||||
// subseqidx handles this
|
||||
} else {
|
||||
let w = str::utf8_char_width(byte);
|
||||
let w = unicode_str::utf8_char_width(byte);
|
||||
|
||||
match w {
|
||||
2 => {
|
||||
@ -235,7 +236,7 @@ impl String {
|
||||
res.as_mut_vec().push_all(v[subseqidx..total])
|
||||
};
|
||||
}
|
||||
Cow::Owned(res.into_string())
|
||||
Cow::Owned(res)
|
||||
}
|
||||
|
||||
/// Decode a UTF-16 encoded vector `v` into a `String`, returning `None`
|
||||
@ -256,10 +257,10 @@ impl String {
|
||||
#[unstable = "error value in return may change"]
|
||||
pub fn from_utf16(v: &[u16]) -> Option<String> {
|
||||
let mut s = String::with_capacity(v.len());
|
||||
for c in str::utf16_items(v) {
|
||||
for c in unicode_str::utf16_items(v) {
|
||||
match c {
|
||||
str::ScalarValue(c) => s.push(c),
|
||||
str::LoneSurrogate(_) => return None
|
||||
Utf16Item::ScalarValue(c) => s.push(c),
|
||||
Utf16Item::LoneSurrogate(_) => return None
|
||||
}
|
||||
}
|
||||
Some(s)
|
||||
@ -281,7 +282,7 @@ impl String {
|
||||
/// ```
|
||||
#[stable]
|
||||
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
||||
str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
||||
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
||||
}
|
||||
|
||||
/// Convert a vector of `char`s to a `String`.
|
||||
@ -812,21 +813,12 @@ impl<'a, 'b> PartialEq<CowString<'a>> for &'b str {
|
||||
}
|
||||
|
||||
#[experimental = "waiting on Str stabilization"]
|
||||
#[allow(deprecated)]
|
||||
impl Str for String {
|
||||
#[inline]
|
||||
#[stable]
|
||||
fn as_slice<'a>(&'a self) -> &'a str {
|
||||
unsafe {
|
||||
mem::transmute(self.vec.as_slice())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[experimental = "waiting on StrAllocating stabilization"]
|
||||
impl StrAllocating for String {
|
||||
#[inline]
|
||||
fn into_string(self) -> String {
|
||||
self
|
||||
unsafe { mem::transmute(self.vec.as_slice()) }
|
||||
}
|
||||
}
|
||||
|
||||
@ -841,7 +833,7 @@ impl Default for String {
|
||||
#[experimental = "waiting on Show stabilization"]
|
||||
impl fmt::Show for String {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.as_slice().fmt(f)
|
||||
(*self).fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
@ -849,7 +841,7 @@ impl fmt::Show for String {
|
||||
impl<H: hash::Writer> hash::Hash<H> for String {
|
||||
#[inline]
|
||||
fn hash(&self, hasher: &mut H) {
|
||||
self.as_slice().hash(hasher)
|
||||
(*self).hash(hasher)
|
||||
}
|
||||
}
|
||||
|
||||
@ -873,7 +865,7 @@ impl<'a> Add<&'a str, String> for String {
|
||||
impl ops::Slice<uint, str> for String {
|
||||
#[inline]
|
||||
fn as_slice_<'a>(&'a self) -> &'a str {
|
||||
self.as_slice()
|
||||
unsafe { mem::transmute(self.vec.as_slice()) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@ -894,7 +886,9 @@ impl ops::Slice<uint, str> for String {
|
||||
|
||||
#[experimental = "waiting on Deref stabilization"]
|
||||
impl ops::Deref<str> for String {
|
||||
fn deref<'a>(&'a self) -> &'a str { self.as_slice() }
|
||||
fn deref<'a>(&'a self) -> &'a str {
|
||||
unsafe { mem::transmute(self.vec[]) }
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper type providing a `&String` reference via `Deref`.
|
||||
@ -1015,6 +1009,18 @@ pub mod raw {
|
||||
}
|
||||
}
|
||||
|
||||
/// A clone-on-write string
|
||||
#[stable]
|
||||
pub type CowString<'a> = Cow<'a, String, str>;
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl<'a> Str for CowString<'a> {
|
||||
#[inline]
|
||||
fn as_slice<'b>(&'b self) -> &'b str {
|
||||
(**self).as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use prelude::*;
|
||||
|
@ -23,7 +23,7 @@ use num::cast;
|
||||
use ops::FnOnce;
|
||||
use result::Result::Ok;
|
||||
use slice::{mod, SliceExt};
|
||||
use str::StrPrelude;
|
||||
use str::StrExt;
|
||||
|
||||
/// A flag that specifies whether to use exponential (scientific) notation.
|
||||
pub enum ExponentFormat {
|
||||
|
@ -24,7 +24,7 @@ use result::Result::{Ok, Err};
|
||||
use result;
|
||||
use slice::SliceExt;
|
||||
use slice;
|
||||
use str::StrPrelude;
|
||||
use str::{StrExt, Utf8Error};
|
||||
|
||||
pub use self::num::radix;
|
||||
pub use self::num::Radix;
|
||||
@ -795,5 +795,18 @@ impl<'b, T: Show> Show for RefMut<'b, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Human-readable formatting for UTF-8 validation errors.
///
/// Every message starts with the prefix `invalid utf-8: `.
impl Show for Utf8Error {
|
||||
/// Writes the message for this error variant into `f`.
fn fmt(&self, f: &mut Formatter) -> Result {
|
||||
match *self {
|
||||
// The variant's payload `n` is reported as the index of the bad byte.
Utf8Error::InvalidByte(n) => {
|
||||
write!(f, "invalid utf-8: invalid byte at index {}", n)
|
||||
}
|
||||
// This variant carries no payload, so the message is fixed.
Utf8Error::TooShort => {
|
||||
write!(f, "invalid utf-8: byte slice too short")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If you expected tests to be here, look instead at the run-pass/ifmt.rs test,
|
||||
// it's a lot easier than creating all of the rt::Piece structures here.
|
||||
|
@ -32,7 +32,7 @@ use ops::{Add, Sub, Mul, Div, Rem, Neg};
|
||||
use ops::{Not, BitAnd, BitOr, BitXor, Shl, Shr};
|
||||
use option::Option;
|
||||
use option::Option::{Some, None};
|
||||
use str::{FromStr, from_str, StrPrelude};
|
||||
use str::{FromStr, from_str, StrExt};
|
||||
|
||||
/// Simultaneous division and remainder
|
||||
#[inline]
|
||||
|
@ -60,7 +60,7 @@ pub use option::Option::{Some, None};
|
||||
pub use ptr::RawPtr;
|
||||
pub use result::Result;
|
||||
pub use result::Result::{Ok, Err};
|
||||
pub use str::{Str, StrPrelude};
|
||||
pub use str::{Str, StrExt};
|
||||
pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4};
|
||||
pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8};
|
||||
pub use tuple::{Tuple9, Tuple10, Tuple11, Tuple12};
|
||||
|
1010
src/libcore/str.rs
1010
src/libcore/str.rs
File diff suppressed because it is too large
Load Diff
@ -115,6 +115,7 @@ pub mod util {
|
||||
pub mod ppaux;
|
||||
pub mod nodemap;
|
||||
pub mod snapshot_vec;
|
||||
pub mod lev_distance;
|
||||
}
|
||||
|
||||
pub mod lib {
|
||||
|
63
src/librustc/util/lev_distance.rs
Normal file
63
src/librustc/util/lev_distance.rs
Normal file
@ -0,0 +1,63 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use std::cmp;
|
||||
|
||||
/// Computes the Levenshtein (edit) distance between `me` and `t`.
///
/// Both strings are walked with `chars()`, so the distance is counted in
/// `char`s rather than bytes. If either string is empty, the result is the
/// number of `char`s in the other one.
pub fn lev_distance(me: &str, t: &str) -> uint {
|
||||
if me.is_empty() { return t.chars().count(); }
|
||||
if t.is_empty() { return me.chars().count(); }
|
||||
|
||||
// Single working row of the distance table, initialised to 0, 1, 2, ...
// It is sized from the byte length of `t`, which is never smaller than the
// number of `char`s in `t`, so every `j + 1` index used below is in bounds.
let mut dcol = Vec::from_fn(t.len() + 1, |x| x);
|
||||
// Index of the last `char` of `t` visited by the inner loop.
let mut t_last = 0;
|
||||
|
||||
for (i, sc) in me.chars().enumerate() {
|
||||
|
||||
// `current` carries the previous row's value one column to the left of
// the cell being written, i.e. the diagonal neighbour of `dcol[j + 1]`.
let mut current = i;
|
||||
dcol[0] = current + 1;
|
||||
|
||||
for (j, tc) in t.chars().enumerate() {
|
||||
|
||||
// Previous row's value for this column (the cell "above").
let next = dcol[j + 1];
|
||||
|
||||
if sc == tc {
|
||||
// Equal chars: the diagonal value is carried over unchanged.
dcol[j + 1] = current;
|
||||
} else {
|
||||
// Otherwise: 1 + min(diagonal, above, left), where `dcol[j]` is the
// cell to the left, already updated for the current row.
dcol[j + 1] = cmp::min(current, next);
|
||||
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
|
||||
}
|
||||
|
||||
current = next;
|
||||
t_last = j;
|
||||
}
|
||||
}
|
||||
|
||||
// The last cell written in the final row holds the distance.
dcol[t_last + 1]
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lev_distance() {
|
||||
use std::char::{ from_u32, MAX };
|
||||
// Test bytelength agnosticity
|
||||
for c in range(0u32, MAX as u32)
|
||||
.filter_map(|i| from_u32(i))
|
||||
.map(|i| String::from_char(1, i)) {
|
||||
assert_eq!(lev_distance(c[], c[]), 0);
|
||||
}
|
||||
|
||||
let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
|
||||
let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
|
||||
let c = "Mary häd ä little lämb\n\nLittle lämb\n";
|
||||
assert_eq!(lev_distance(a, b), 1);
|
||||
assert_eq!(lev_distance(b, a), 1);
|
||||
assert_eq!(lev_distance(a, c), 2);
|
||||
assert_eq!(lev_distance(c, a), 2);
|
||||
assert_eq!(lev_distance(b, c), 1);
|
||||
assert_eq!(lev_distance(c, b), 1);
|
||||
}
|
@ -57,6 +57,7 @@ use rustc::middle::privacy::*;
|
||||
use rustc::middle::subst::{ParamSpace, FnSpace, TypeSpace};
|
||||
use rustc::middle::ty::{CaptureModeMap, Freevar, FreevarMap, TraitMap};
|
||||
use rustc::util::nodemap::{NodeMap, NodeSet, DefIdSet, FnvHashMap};
|
||||
use rustc::util::lev_distance::lev_distance;
|
||||
|
||||
use syntax::ast::{Arm, BindByRef, BindByValue, BindingMode, Block, Crate, CrateNum};
|
||||
use syntax::ast::{DeclItem, DefId, Expr, ExprAgain, ExprBreak, ExprField};
|
||||
@ -96,8 +97,8 @@ use std::mem::replace;
|
||||
use std::rc::{Rc, Weak};
|
||||
use std::uint;
|
||||
|
||||
mod check_unused;
|
||||
mod record_exports;
|
||||
// Definition mapping
|
||||
pub type DefMap = RefCell<NodeMap<Def>>;
|
||||
|
||||
#[deriving(Copy)]
|
||||
struct BindingInfo {
|
||||
@ -5539,7 +5540,7 @@ impl<'a> Resolver<'a> {
|
||||
|
||||
let mut smallest = 0;
|
||||
for (i, other) in maybes.iter().enumerate() {
|
||||
values[i] = name.lev_distance(other.get());
|
||||
values[i] = lev_distance(name, other.get());
|
||||
|
||||
if values[i] <= values[smallest] {
|
||||
smallest = i;
|
||||
|
@ -78,10 +78,9 @@
|
||||
//! }
|
||||
//! ```
|
||||
|
||||
use option::Option;
|
||||
use option::Option::None;
|
||||
use kinds::Send;
|
||||
use string::String;
|
||||
use prelude::*;
|
||||
|
||||
use str::Utf8Error;
|
||||
|
||||
/// Base functionality for all errors in Rust.
|
||||
pub trait Error: Send {
|
||||
@ -107,3 +106,14 @@ impl<E> FromError<E> for E {
|
||||
err
|
||||
}
|
||||
}
|
||||
|
||||
/// Lets `Utf8Error` be used through the `Error` trait.
impl Error for Utf8Error {
|
||||
/// Returns a fixed, static summary chosen by variant; the payload of
/// `InvalidByte` is ignored here.
fn description(&self) -> &str {
|
||||
match *self {
|
||||
Utf8Error::TooShort => "invalid utf-8: not enough bytes",
|
||||
Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents",
|
||||
}
|
||||
}
|
||||
|
||||
/// Always returns `Some`, containing the result of `self.to_string()`.
fn detail(&self) -> Option<String> { Some(self.to_string()) }
|
||||
}
|
||||
|
@ -729,7 +729,7 @@ fn real_args() -> Vec<String> {
|
||||
// Push it onto the list.
|
||||
let ptr = ptr as *const u16;
|
||||
let buf = slice::from_raw_buf(&ptr, len);
|
||||
let opt_s = String::from_utf16(::str::truncate_utf16_at_nul(buf));
|
||||
let opt_s = String::from_utf16(os_imp::truncate_utf16_at_nul(buf));
|
||||
opt_s.expect("CommandLineToArgvW returned invalid UTF-16")
|
||||
});
|
||||
|
||||
|
@ -31,6 +31,16 @@ use libc::types::os::arch::extra::DWORD;
|
||||
|
||||
const BUF_BYTES : uint = 2048u;
|
||||
|
||||
/// Return a slice of `v` ending at (and not including) the first NUL
|
||||
/// (0).
///
/// If `v` contains no 0 element, `v` itself is returned unchanged. If the
/// first element is 0, the result is an empty slice.
|
||||
pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
|
||||
// `position` yields the index of the first element equal to 0, if any.
match v.iter().position(|c| *c == 0) {
|
||||
// don't include the 0
|
||||
Some(i) => v[..i],
|
||||
None => v
|
||||
}
|
||||
}
|
||||
|
||||
pub fn errno() -> uint {
|
||||
use libc::types::os::arch::extra::DWORD;
|
||||
|
||||
@ -87,7 +97,7 @@ pub fn error_string(errnum: i32) -> String {
|
||||
return format!("OS Error {} (FormatMessageW() returned error {})", errnum, fm_err);
|
||||
}
|
||||
|
||||
let msg = String::from_utf16(::str::truncate_utf16_at_nul(&buf));
|
||||
let msg = String::from_utf16(truncate_utf16_at_nul(&buf));
|
||||
match msg {
|
||||
Some(msg) => format!("OS Error {}: {}", errnum, msg),
|
||||
None => format!("OS Error {} (FormatMessageW() returned invalid UTF-16)", errnum),
|
||||
@ -294,3 +304,30 @@ pub fn page_size() -> uint {
|
||||
return info.dwPageSize as uint;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::truncate_utf16_at_nul;
|
||||
|
||||
#[test]
|
||||
fn test_truncate_utf16_at_nul() {
|
||||
let v = [];
|
||||
let b: &[u16] = &[];
|
||||
assert_eq!(truncate_utf16_at_nul(&v), b);
|
||||
|
||||
let v = [0, 2, 3];
|
||||
assert_eq!(truncate_utf16_at_nul(&v), b);
|
||||
|
||||
let v = [1, 0, 3];
|
||||
let b: &[u16] = &[1];
|
||||
assert_eq!(truncate_utf16_at_nul(&v), b);
|
||||
|
||||
let v = [1, 2, 0];
|
||||
let b: &[u16] = &[1, 2];
|
||||
assert_eq!(truncate_utf16_at_nul(&v), b);
|
||||
|
||||
let v = [1, 2, 3];
|
||||
let b: &[u16] = &[1, 2, 3];
|
||||
assert_eq!(truncate_utf16_at_nul(&v), b);
|
||||
}
|
||||
}
|
||||
|
@ -28,8 +28,7 @@
|
||||
html_root_url = "http://doc.rust-lang.org/nightly/",
|
||||
html_playground_url = "http://play.rust-lang.org/")]
|
||||
#![no_std]
|
||||
#![feature(globs)]
|
||||
#![feature(unboxed_closures)]
|
||||
#![feature(globs, macro_rules, slicing_syntax, unboxed_closures)]
|
||||
|
||||
extern crate core;
|
||||
|
||||
@ -74,11 +73,14 @@ pub mod char {
|
||||
}
|
||||
|
||||
pub mod str {
|
||||
pub use u_str::{UnicodeStrPrelude, Words, Graphemes, GraphemeIndices};
|
||||
pub use u_str::{UnicodeStr, Words, Graphemes, GraphemeIndices};
|
||||
pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};
|
||||
pub use u_str::{utf16_items, Utf16Encoder};
|
||||
}
|
||||
|
||||
// this lets us use #[deriving(Clone)]
|
||||
// this lets us use #[deriving(..)]
|
||||
mod std {
|
||||
pub use core::clone;
|
||||
pub use core::cmp;
|
||||
pub use core::fmt;
|
||||
}
|
||||
|
@ -15,24 +15,36 @@
|
||||
//! This module provides functionality to `str` that requires the Unicode methods provided by the
|
||||
//! UnicodeChar trait.
|
||||
|
||||
use self::GraphemeState::*;
|
||||
use core::prelude::*;
|
||||
|
||||
use core::char;
|
||||
use core::cmp;
|
||||
use core::slice::SliceExt;
|
||||
use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
|
||||
use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt};
|
||||
use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
|
||||
use core::iter::{Filter, AdditiveIterator};
|
||||
use core::kinds::Sized;
|
||||
use core::option::Option;
|
||||
use core::mem;
|
||||
use core::num::Int;
|
||||
use core::option::Option::{None, Some};
|
||||
use core::option::Option;
|
||||
use core::slice::SliceExt;
|
||||
use core::slice;
|
||||
use core::str::{CharSplits, StrPrelude};
|
||||
use core::str::{CharSplits};
|
||||
|
||||
use u_char::UnicodeChar;
|
||||
use tables::grapheme::GraphemeCat;
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
/// FIXME: This should be opaque
|
||||
pub type Words<'a> = Filter<&'a str, CharSplits<'a, fn(char) -> bool>, fn(&&str) -> bool>;
|
||||
#[stable]
|
||||
pub struct Words<'a> {
|
||||
inner: Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>,
|
||||
fn(&&str) -> bool>,
|
||||
}
|
||||
|
||||
/// Methods for Unicode string slices
|
||||
pub trait UnicodeStrPrelude for Sized? {
|
||||
pub trait UnicodeStr for Sized? {
|
||||
/// Returns an iterator over the
|
||||
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
|
||||
/// of the string.
|
||||
@ -77,6 +89,7 @@ pub trait UnicodeStrPrelude for Sized? {
|
||||
/// let v: Vec<&str> = some_words.words().collect();
|
||||
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
|
||||
/// ```
|
||||
#[stable]
|
||||
fn words<'a>(&'a self) -> Words<'a>;
|
||||
|
||||
/// Returns true if the string contains only whitespace.
|
||||
@ -129,7 +142,7 @@ pub trait UnicodeStrPrelude for Sized? {
|
||||
fn trim_right<'a>(&'a self) -> &'a str;
|
||||
}
|
||||
|
||||
impl UnicodeStrPrelude for str {
|
||||
impl UnicodeStr for str {
|
||||
#[inline]
|
||||
fn graphemes(&self, is_extended: bool) -> Graphemes {
|
||||
Graphemes { string: self, extended: is_extended, cat: None, catb: None }
|
||||
@ -145,7 +158,7 @@ impl UnicodeStrPrelude for str {
|
||||
fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
|
||||
fn is_whitespace(c: char) -> bool { c.is_whitespace() }
|
||||
|
||||
self.split(is_whitespace).filter(is_not_empty)
|
||||
Words { inner: self.split(is_whitespace).filter(is_not_empty) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@ -428,3 +441,196 @@ impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
|
||||
Some(retstr)
|
||||
}
|
||||
}
|
||||
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
static UTF8_CHAR_WIDTH: [u8, ..256] = [
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
|
||||
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
|
||||
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
|
||||
];
|
||||
|
||||
/// Given a first byte, determine how many bytes are in this UTF-8 character
///
/// The answer is a plain lookup in the 256-entry `UTF8_CHAR_WIDTH` table,
/// indexed by `b`. The table holds 0 for bytes 0x80..=0xC1 and 0xF5..=0xFF,
/// so a return value of 0 means `b` has no width entry as a first byte.
|
||||
#[inline]
|
||||
pub fn utf8_char_width(b: u8) -> uint {
|
||||
// `b` is a `u8`, so the index is always within the 256-entry table.
return UTF8_CHAR_WIDTH[b as uint] as uint;
|
||||
}
|
||||
|
||||
/// Determines if a vector of `u16` contains valid UTF-16
///
/// Returns `true` when the end of `v` is reached without finding a problem
/// (an empty slice is therefore valid). Returns `false` when a unit that is
/// not a `char` on its own is the last unit of `v`, or when the range checks
/// on that unit and the unit after it fail.
|
||||
pub fn is_utf16(v: &[u16]) -> bool {
|
||||
let mut it = v.iter();
|
||||
// `next!(ret)` evaluates to the next code unit, or returns `ret` from
// `is_utf16` itself when the input is exhausted.
macro_rules! next ( ($ret:expr) => {
|
||||
match it.next() { Some(u) => *u, None => return $ret }
|
||||
}
|
||||
)
|
||||
loop {
|
||||
// Running out of input at a unit boundary means everything was valid.
let u = next!(true);
|
||||
|
||||
match char::from_u32(u as u32) {
|
||||
// The unit is a `char` on its own; nothing more to check.
Some(_) => {}
|
||||
None => {
|
||||
// A second unit must follow; running out of input here is invalid.
let u2 = next!(false);
|
||||
// `u2` must lie in 0xDC00..=0xDFFF and `u` must not exceed 0xDBFF.
// NOTE(review): the lower bound on `u` is written as 0xD7FF; this arm
// is only reached when `char::from_u32` returned `None`, so presumably
// 0xD800 was intended and the difference is harmless — confirm.
if u < 0xD7FF || u > 0xDBFF ||
|
||||
u2 < 0xDC00 || u2 > 0xDFFF { return false; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from a vector
|
||||
/// of `u16`s.
|
||||
#[deriving(Clone)]
|
||||
pub struct Utf16Items<'a> {
|
||||
iter: slice::Items<'a, u16>
|
||||
}
|
||||
/// The possibilities for values decoded from a `u16` stream.
|
||||
#[deriving(PartialEq, Eq, Clone, Show)]
|
||||
pub enum Utf16Item {
|
||||
/// A valid codepoint.
|
||||
ScalarValue(char),
|
||||
/// An invalid surrogate without its pair.
|
||||
LoneSurrogate(u16)
|
||||
}
|
||||
|
||||
impl Copy for Utf16Item {}
|
||||
|
||||
impl Utf16Item {
|
||||
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
|
||||
/// replacement character (U+FFFD).
|
||||
#[inline]
|
||||
pub fn to_char_lossy(&self) -> char {
|
||||
match *self {
|
||||
Utf16Item::ScalarValue(c) => c,
|
||||
Utf16Item::LoneSurrogate(_) => '\uFFFD'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decodes the wrapped `u16` iterator one item at a time.
///
/// Each call to `next` consumes one unit, or two units when they form a
/// lead/trail surrogate pair, and yields either a `ScalarValue` or a
/// `LoneSurrogate` carrying the offending unit.
impl<'a> Iterator<Utf16Item> for Utf16Items<'a> {
|
||||
fn next(&mut self) -> Option<Utf16Item> {
|
||||
let u = match self.iter.next() {
|
||||
Some(u) => *u,
|
||||
None => return None
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
// `u` is a 16-bit value outside 0xD800..=0xDFFF, which is what the
// transmute of its `u32` widening into a `char` relies on.
Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(Utf16Item::LoneSurrogate(u))
|
||||
} else {
|
||||
// Here `u` is in 0xD800..=0xDBFF (a leading surrogate).
// preserve state for rewinding.
|
||||
let old = self.iter;
|
||||
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => *u2,
|
||||
// eof
|
||||
None => return Some(Utf16Item::LoneSurrogate(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.iter = old;
|
||||
return Some(Utf16Item::LoneSurrogate(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
// Ten bits from each unit are combined and offset by 0x1_0000, so `c`
// falls in 0x1_0000..=0x10_FFFF, which the transmute below relies on.
let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower bound is half the inner iterator's lower bound; the upper bound
/// is passed through unchanged.
#[inline]
|
||||
fn size_hint(&self) -> (uint, Option<uint>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an iterator over the UTF-16 encoded codepoints in `v`,
|
||||
/// returning invalid surrogates as `LoneSurrogate`s.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use std::str;
|
||||
/// use std::str::{ScalarValue, LoneSurrogate};
|
||||
///
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(str::utf16_items(&v).collect::<Vec<_>>(),
|
||||
/// vec![ScalarValue('𝄞'),
|
||||
/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
|
||||
/// LoneSurrogate(0xDD1E),
|
||||
/// ScalarValue('i'), ScalarValue('c'),
|
||||
/// LoneSurrogate(0xD834)]);
|
||||
/// ```
|
||||
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
|
||||
Utf16Items { iter : v.iter() }
|
||||
}
|
||||
|
||||
/// Iterator adaptor for encoding `char`s to UTF-16.
|
||||
#[deriving(Clone)]
|
||||
pub struct Utf16Encoder<I> {
|
||||
chars: I,
|
||||
extra: u16
|
||||
}
|
||||
|
||||
impl<I> Utf16Encoder<I> {
|
||||
/// Create a UTF-16 encoder from any `char` iterator.
|
||||
pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<char> {
|
||||
Utf16Encoder { chars: chars, extra: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
/// Yields the UTF-16 code units of the wrapped `char` iterator one at a time.
///
/// When a `char` encodes to two units, the first is returned immediately and
/// the second is parked in `self.extra` for the following call.
impl<I> Iterator<u16> for Utf16Encoder<I> where I: Iterator<char> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<u16> {
|
||||
// A non-zero `extra` is a pending second unit from the previous call;
// 0 is used as the "nothing pending" marker.
if self.extra != 0 {
|
||||
let tmp = self.extra;
|
||||
self.extra = 0;
|
||||
return Some(tmp);
|
||||
}
|
||||
|
||||
let mut buf = [0u16, ..2];
|
||||
self.chars.next().map(|ch| {
|
||||
// presumably `encode_utf16` returns the number of units written into
// `buf`, or `None` on failure — TODO confirm against its definition.
let n = ch.encode_utf16(buf[mut]).unwrap_or(0);
|
||||
// Two units were produced: keep the second one for the next call.
if n == 2 { self.extra = buf[1]; }
|
||||
buf[0]
|
||||
})
|
||||
}
|
||||
|
||||
/// Bounds are derived from the inner iterator only: same lower bound, and
/// twice the upper bound (or `None` if doubling overflows).
// NOTE(review): a unit pending in `self.extra` is not counted here — confirm
// whether that matters to callers.
#[inline]
|
||||
fn size_hint(&self) -> (uint, Option<uint>) {
|
||||
let (low, high) = self.chars.size_hint();
|
||||
// every char gets either one u16 or two u16,
|
||||
// so this iterator is between 1 or 2 times as
|
||||
// long as the underlying iterator.
|
||||
(low, high.and_then(|n| n.checked_mul(2)))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator<&'a str> for Words<'a> {
|
||||
fn next(&mut self) -> Option<&'a str> { self.inner.next() }
|
||||
}
|
||||
impl<'a> DoubleEndedIterator<&'a str> for Words<'a> {
|
||||
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user