std: Stabilize the std::str module

This commit starts out by consolidating all `str` extension traits into one `StrExt` trait to be included in the prelude. This means that `UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into one `StrExt` exported by the standard library. Some functionality is currently duplicated with the `StrExt` present in libcore. This commit also currently avoids any methods which require any form of pattern to operate. These functions will be stabilized via a separate RFC. Next, the stability of methods and structures is as follows: Stable * from_utf8_unchecked * CowString - after moving to std::string * StrExt::as_bytes * StrExt::as_ptr * StrExt::bytes/Bytes - also made a struct instead of a typedef * StrExt::char_indices/CharIndices - CharOffsets was renamed * StrExt::chars/Chars * StrExt::is_empty * StrExt::len * StrExt::lines/Lines * StrExt::lines_any/LinesAny * StrExt::slice_unchecked * StrExt::trim * StrExt::trim_left * StrExt::trim_right * StrExt::words/Words - also made a struct instead of a typedef Unstable * from_utf8 - the return type was changed to a `Result`, but the error type has yet to prove itself * from_c_str - this function will be handled by the c_str RFC * FromStr - this trait will have an associated error type eventually * StrExt::escape_default - needs iterators at least, unsure if it should make the cut * StrExt::escape_unicode - needs iterators at least, unsure if it should make the cut * StrExt::slice_chars - this function has yet to prove itself * StrExt::slice_shift_char - awaiting conventions about slicing and shifting * StrExt::graphemes/Graphemes - this functionality may only be in libunicode * StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in libunicode * StrExt::width - this functionality may only be in libunicode * StrExt::utf16_units - this functionality may only be in libunicode * StrExt::nfd_chars - this functionality may only be in libunicode * StrExt::nfkd_chars - this functionality may 
only be in libunicode * StrExt::nfc_chars - this functionality may only be in libunicode * StrExt::nfkc_chars - this functionality may only be in libunicode * StrExt::is_char_boundary - naming is uncertain with container conventions * StrExt::char_range_at - naming is uncertain with container conventions * StrExt::char_range_at_reverse - naming is uncertain with container conventions * StrExt::char_at - naming is uncertain with container conventions * StrExt::char_at_reverse - naming is uncertain with container conventions * StrVector::concat - this functionality may be replaced with iterators, but it's not certain at this time * StrVector::connect - as with concat, may be deprecated in favor of iterators Deprecated * StrAllocating and UnicodeStrPrelude have been merged into StrExt * eq_slice - compiler implementation detail * from_str - use the inherent parse() method * is_utf8 - call from_utf8 instead * replace - call the method instead * truncate_utf16_at_nul - this is an implementation detail of Windows and does not need to be exposed. * utf8_char_width - moved to libunicode * utf16_items - moved to libunicode * is_utf16 - moved to libunicode * Utf16Items - moved to libunicode * Utf16Item - moved to libunicode * Utf16Encoder - moved to libunicode * AnyLines - renamed to LinesAny and made a struct * SendStr - use CowString<'static> instead * str::raw - all functionality is deprecated * StrExt::into_string - call to_string() instead * StrExt::repeat - use iterators instead * StrExt::char_len - use .chars().count() instead * StrExt::is_alphanumeric - use .chars().all(..) * StrExt::is_whitespace - use .chars().all(..) Pending deprecation -- while slicing syntax is being worked out, these methods are all #[unstable] * Str - while currently used for generic programming, this trait will be replaced with one of [], deref coercions, or a generic conversion trait. 
* StrExt::slice - use slicing syntax instead * StrExt::slice_to - use slicing syntax instead * StrExt::slice_from - use slicing syntax instead * StrExt::lev_distance - deprecated with no replacement Awaiting stabilization due to patterns and/or matching * StrExt::contains * StrExt::contains_char * StrExt::split * StrExt::splitn * StrExt::split_terminator * StrExt::rsplitn * StrExt::match_indices * StrExt::split_str * StrExt::starts_with * StrExt::ends_with * StrExt::trim_chars * StrExt::trim_left_chars * StrExt::trim_right_chars * StrExt::find * StrExt::rfind * StrExt::find_str * StrExt::subslice_offset
2025-05-14 02:49:40 +00:00 · 2014-12-10 09:02:31 -08:00 · 2014-12-10 09:02:31 -08:00 · 4908017d59
commit 4908017d59
parent 34d6800092
15 changed files with 1511 additions and 1078 deletions
--- a/src/libcollections/str.rs
+++ b/src/libcollections/str.rs
--- a/src/libcollections/string.rs
+++ b/src/libcollections/string.rs
@ -21,13 +21,12 @@ use core::hash;
 use core::mem;
 use core::ptr;
 use core::ops;
-// FIXME: ICE's abound if you import the `Slice` type while importing `Slice` trait
 use core::raw::Slice as RawSlice;
+use unicode::str as unicode_str;
+use unicode::str::Utf16Item;

 use slice::CloneSliceExt;
-use str;
-use str::{CharRange, CowString, FromStr, StrAllocating};
-use str::MaybeOwned::Owned;
+use str::{mod, CharRange, FromStr, StrExt, Owned, Utf8Error};
 use vec::{DerefVec, Vec, as_vec};

 /// A growable string stored as a UTF-8 encoded buffer.
@ -87,8 +86,10 @@ impl String {
    /// Returns the vector as a string buffer, if possible, taking care not to
    /// copy it.
    ///
-    /// Returns `Err` with the original vector if the vector contains invalid
-    /// UTF-8.
+    /// # Failure
+    ///
+    /// If the given vector is not valid UTF-8, then the original vector and the
+    /// corresponding error is returned.
    ///
    /// # Examples
    ///
@ -103,11 +104,10 @@ impl String {
    /// ```
    #[inline]
    #[unstable = "error type may change"]
-    pub fn from_utf8(vec: Vec<u8>) -> Result<String, Vec<u8>> {
-        if str::is_utf8(vec.as_slice()) {
-            Ok(String { vec: vec })
-        } else {
-            Err(vec)
+    pub fn from_utf8(vec: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
+        match str::from_utf8(vec.as_slice()) {
+            Ok(..) => Ok(String { vec: vec }),
+            Err(e) => Err((vec, e))
        }
    }

@ -123,8 +123,9 @@ impl String {
    /// ```
    #[unstable = "return type may change"]
    pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> CowString<'a> {
-        if str::is_utf8(v) {
-            return Cow::Borrowed(unsafe { mem::transmute(v) })
+        match str::from_utf8(v) {
+            Ok(s) => return Cow::Borrowed(s),
+            Err(..) => {}
        }

        static TAG_CONT_U8: u8 = 128u8;
@ -173,7 +174,7 @@ impl String {
            if byte < 128u8 {
                // subseqidx handles this
            } else {
-                let w = str::utf8_char_width(byte);
+                let w = unicode_str::utf8_char_width(byte);

                match w {
                    2 => {
@ -235,7 +236,7 @@ impl String {
                res.as_mut_vec().push_all(v[subseqidx..total])
            };
        }
-        Cow::Owned(res.into_string())
+        Cow::Owned(res)
    }

    /// Decode a UTF-16 encoded vector `v` into a `String`, returning `None`
@ -256,10 +257,10 @@ impl String {
    #[unstable = "error value in return may change"]
    pub fn from_utf16(v: &[u16]) -> Option<String> {
        let mut s = String::with_capacity(v.len());
-        for c in str::utf16_items(v) {
+        for c in unicode_str::utf16_items(v) {
            match c {
-                str::ScalarValue(c) => s.push(c),
-                str::LoneSurrogate(_) => return None
+                Utf16Item::ScalarValue(c) => s.push(c),
+                Utf16Item::LoneSurrogate(_) => return None
            }
        }
        Some(s)
@ -281,7 +282,7 @@ impl String {
    /// ```
    #[stable]
    pub fn from_utf16_lossy(v: &[u16]) -> String {
-        str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
+        unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
    }

    /// Convert a vector of `char`s to a `String`.
@ -812,21 +813,12 @@ impl<'a, 'b> PartialEq<CowString<'a>> for &'b str {
 }

 #[experimental = "waiting on Str stabilization"]
+#[allow(deprecated)]
 impl Str for String {
    #[inline]
    #[stable]
    fn as_slice<'a>(&'a self) -> &'a str {
-        unsafe {
-            mem::transmute(self.vec.as_slice())
-        }
-    }
-}
-
-#[experimental = "waiting on StrAllocating stabilization"]
-impl StrAllocating for String {
-    #[inline]
-    fn into_string(self) -> String {
-        self
+        unsafe { mem::transmute(self.vec.as_slice()) }
    }
 }

@ -841,7 +833,7 @@ impl Default for String {
 #[experimental = "waiting on Show stabilization"]
 impl fmt::Show for String {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        self.as_slice().fmt(f)
+        (**self).fmt(f) // format via Deref to `str`; a single `*self` is still `String` and would recurse into this impl
    }
 }

@ -849,7 +841,7 @@ impl fmt::Show for String {
 impl<H: hash::Writer> hash::Hash<H> for String {
    #[inline]
    fn hash(&self, hasher: &mut H) {
-        self.as_slice().hash(hasher)
+        (**self).hash(hasher) // hash via Deref to `str`; a single `*self` is still `String` and would recurse into this impl
    }
 }

@ -873,7 +865,7 @@ impl<'a> Add<&'a str, String> for String {
 impl ops::Slice<uint, str> for String {
    #[inline]
    fn as_slice_<'a>(&'a self) -> &'a str {
-        self.as_slice()
+        unsafe { mem::transmute(self.vec.as_slice()) }
    }

    #[inline]
@ -894,7 +886,9 @@ impl ops::Slice<uint, str> for String {

 #[experimental = "waiting on Deref stabilization"]
 impl ops::Deref<str> for String {
-    fn deref<'a>(&'a self) -> &'a str { self.as_slice() }
+    fn deref<'a>(&'a self) -> &'a str {
+        unsafe { mem::transmute(self.vec[]) }
+    }
 }

 /// Wrapper type providing a `&String` reference via `Deref`.
@ -1015,6 +1009,18 @@ pub mod raw {
    }
 }

+/// A clone-on-write string
+#[stable]
+pub type CowString<'a> = Cow<'a, String, str>;
+
+#[allow(deprecated)]
+impl<'a> Str for CowString<'a> {
+    #[inline]
+    fn as_slice<'b>(&'b self) -> &'b str {
+        (**self).as_slice()
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use prelude::*;
--- a/src/libcore/fmt/float.rs
+++ b/src/libcore/fmt/float.rs
@ -23,7 +23,7 @@ use num::cast;
 use ops::FnOnce;
 use result::Result::Ok;
 use slice::{mod, SliceExt};
-use str::StrPrelude;
+use str::StrExt;

 /// A flag that specifies whether to use exponential (scientific) notation.
 pub enum ExponentFormat {
--- a/src/libcore/fmt/mod.rs
+++ b/src/libcore/fmt/mod.rs
@ -24,7 +24,7 @@ use result::Result::{Ok, Err};
 use result;
 use slice::SliceExt;
 use slice;
-use str::StrPrelude;
+use str::{StrExt, Utf8Error};

 pub use self::num::radix;
 pub use self::num::Radix;
@ -795,5 +795,18 @@ impl<'b, T: Show> Show for RefMut<'b, T> {
    }
 }

+impl Show for Utf8Error {
+    fn fmt(&self, f: &mut Formatter) -> Result {
+        match *self {
+            Utf8Error::InvalidByte(n) => {
+                write!(f, "invalid utf-8: invalid byte at index {}", n)
+            }
+            Utf8Error::TooShort => {
+                write!(f, "invalid utf-8: byte slice too short")
+            }
+        }
+    }
+}
+
 // If you expected tests to be here, look instead at the run-pass/ifmt.rs test,
 // it's a lot easier than creating all of the rt::Piece structures here.
--- a/src/libcore/num/mod.rs
+++ b/src/libcore/num/mod.rs
@ -32,7 +32,7 @@ use ops::{Add, Sub, Mul, Div, Rem, Neg};
 use ops::{Not, BitAnd, BitOr, BitXor, Shl, Shr};
 use option::Option;
 use option::Option::{Some, None};
-use str::{FromStr, from_str, StrPrelude};
+use str::{FromStr, from_str, StrExt};

 /// Simultaneous division and remainder
 #[inline]
--- a/src/libcore/prelude.rs
+++ b/src/libcore/prelude.rs
@ -60,7 +60,7 @@ pub use option::Option::{Some, None};
 pub use ptr::RawPtr;
 pub use result::Result;
 pub use result::Result::{Ok, Err};
-pub use str::{Str, StrPrelude};
+pub use str::{Str, StrExt};
 pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4};
 pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8};
 pub use tuple::{Tuple9, Tuple10, Tuple11, Tuple12};
--- a/src/libcore/str.rs
+++ b/src/libcore/str.rs
--- a/src/librustc/lib.rs
+++ b/src/librustc/lib.rs
@ -115,6 +115,7 @@ pub mod util {
    pub mod ppaux;
    pub mod nodemap;
    pub mod snapshot_vec;
+    pub mod lev_distance;
 }

 pub mod lib {
--- a/src/librustc/util/lev_distance.rs
+++ b/src/librustc/util/lev_distance.rs
@ -0,0 +1,63 @@
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::cmp;
+/// Levenshtein edit distance between `me` and `t`, counted in `char`s rather than bytes.
+pub fn lev_distance(me: &str, t: &str) -> uint {
+    if me.is_empty() { return t.chars().count(); } // nothing to match: every char of `t` is an insertion
+    if t.is_empty() { return me.chars().count(); } // symmetric case
+
+    let mut dcol = Vec::from_fn(t.len() + 1, |x| x); // one rolling row; sized by byte length, only the first chars+1 slots are used
+    let mut t_last = 0;
+
+    for (i, sc) in me.chars().enumerate() {
+        // `current` starts as the previous row's column 0 and trails one column behind `j + 1`
+        let mut current = i;
+        dcol[0] = current + 1;
+
+        for (j, tc) in t.chars().enumerate() {
+            // save the previous row's entry for this column before it is overwritten
+            let next = dcol[j + 1];
+
+            if sc == tc {
+                dcol[j + 1] = current; // chars match: carry the diagonal value over unchanged
+            } else {
+                dcol[j + 1] = cmp::min(current, next);
+                dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1; // cheapest of diagonal / above / left, plus one edit
+            }
+
+            current = next;
+            t_last = j; // remember the char index of the last column actually filled
+        }
+    }
+
+    dcol[t_last + 1] // entry for the final char of `t` after the final char of `me`
+}
+
+#[test]
+fn test_lev_distance() {
+    use std::char::{ from_u32, MAX };
+    // Test bytelength agnosticity
+    for c in range(0u32, MAX as u32)
+             .filter_map(|i| from_u32(i))
+             .map(|i| String::from_char(1, i)) {
+        assert_eq!(lev_distance(c[], c[]), 0);
+    }
+
+    let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
+    let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
+    let c = "Mary häd ä little lämb\n\nLittle lämb\n";
+    assert_eq!(lev_distance(a, b), 1);
+    assert_eq!(lev_distance(b, a), 1);
+    assert_eq!(lev_distance(a, c), 2);
+    assert_eq!(lev_distance(c, a), 2);
+    assert_eq!(lev_distance(b, c), 1);
+    assert_eq!(lev_distance(c, b), 1);
+}
--- a/src/librustc_resolve/lib.rs
+++ b/src/librustc_resolve/lib.rs
@ -57,6 +57,7 @@ use rustc::middle::privacy::*;
 use rustc::middle::subst::{ParamSpace, FnSpace, TypeSpace};
 use rustc::middle::ty::{CaptureModeMap, Freevar, FreevarMap, TraitMap};
 use rustc::util::nodemap::{NodeMap, NodeSet, DefIdSet, FnvHashMap};
+use rustc::util::lev_distance::lev_distance;

 use syntax::ast::{Arm, BindByRef, BindByValue, BindingMode, Block, Crate, CrateNum};
 use syntax::ast::{DeclItem, DefId, Expr, ExprAgain, ExprBreak, ExprField};
@ -96,8 +97,8 @@ use std::mem::replace;
 use std::rc::{Rc, Weak};
 use std::uint;

-mod check_unused;
-mod record_exports;
+// Definition mapping
+pub type DefMap = RefCell<NodeMap<Def>>;

 #[deriving(Copy)]
 struct BindingInfo {
@ -5539,7 +5540,7 @@ impl<'a> Resolver<'a> {

        let mut smallest = 0;
        for (i, other) in maybes.iter().enumerate() {
-            values[i] = name.lev_distance(other.get());
+            values[i] = lev_distance(name, other.get());

            if values[i] <= values[smallest] {
                smallest = i;
--- a/src/libstd/error.rs
+++ b/src/libstd/error.rs
@ -78,10 +78,9 @@
 //! }
 //! ```

-use option::Option;
-use option::Option::None;
-use kinds::Send;
-use string::String;
+use prelude::*;
+
+use str::Utf8Error;

 /// Base functionality for all errors in Rust.
 pub trait Error: Send {
@ -107,3 +106,14 @@ impl<E> FromError<E> for E {
        err
    }
 }
+
+impl Error for Utf8Error {
+    fn description(&self) -> &str {
+        match *self {
+            Utf8Error::TooShort => "invalid utf-8: not enough bytes",
+            Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents",
+        }
+    }
+
+    fn detail(&self) -> Option<String> { Some(self.to_string()) }
+}
--- a/src/libstd/os.rs
+++ b/src/libstd/os.rs
@ -729,7 +729,7 @@ fn real_args() -> Vec<String> {
        // Push it onto the list.
        let ptr = ptr as *const u16;
        let buf = slice::from_raw_buf(&ptr, len);
-        let opt_s = String::from_utf16(::str::truncate_utf16_at_nul(buf));
+        let opt_s = String::from_utf16(os_imp::truncate_utf16_at_nul(buf));
        opt_s.expect("CommandLineToArgvW returned invalid UTF-16")
    });

--- a/src/libstd/sys/windows/os.rs
+++ b/src/libstd/sys/windows/os.rs
@ -31,6 +31,16 @@ use libc::types::os::arch::extra::DWORD;

 const BUF_BYTES : uint = 2048u;

+/// Return the prefix of `v` that ends at (and does not include) the first
+/// NUL (0) code unit; if `v` contains no NUL, all of `v` is returned.
+pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
+    match v.iter().position(|c| *c == 0) {
+        // `i` is the index of the first 0, so `..i` stops just before it
+        Some(i) => v[..i],
+        None => v
+    }
+}
+
 pub fn errno() -> uint {
    use libc::types::os::arch::extra::DWORD;

@ -87,7 +97,7 @@ pub fn error_string(errnum: i32) -> String {
            return format!("OS Error {} (FormatMessageW() returned error {})", errnum, fm_err);
        }

-        let msg = String::from_utf16(::str::truncate_utf16_at_nul(&buf));
+        let msg = String::from_utf16(truncate_utf16_at_nul(&buf));
        match msg {
            Some(msg) => format!("OS Error {}: {}", errnum, msg),
            None => format!("OS Error {} (FormatMessageW() returned invalid UTF-16)", errnum),
@ -294,3 +304,30 @@ pub fn page_size() -> uint {
        return info.dwPageSize as uint;
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::truncate_utf16_at_nul;
+
+    #[test]
+    fn test_truncate_utf16_at_nul() {
+        let v = [];
+        let b: &[u16] = &[];
+        assert_eq!(truncate_utf16_at_nul(&v), b);
+
+        let v = [0, 2, 3];
+        assert_eq!(truncate_utf16_at_nul(&v), b);
+
+        let v = [1, 0, 3];
+        let b: &[u16] = &[1];
+        assert_eq!(truncate_utf16_at_nul(&v), b);
+
+        let v = [1, 2, 0];
+        let b: &[u16] = &[1, 2];
+        assert_eq!(truncate_utf16_at_nul(&v), b);
+
+        let v = [1, 2, 3];
+        let b: &[u16] = &[1, 2, 3];
+        assert_eq!(truncate_utf16_at_nul(&v), b);
+    }
+}
--- a/src/libunicode/lib.rs
+++ b/src/libunicode/lib.rs
@ -28,8 +28,7 @@
       html_root_url = "http://doc.rust-lang.org/nightly/",
       html_playground_url = "http://play.rust-lang.org/")]
 #![no_std]
-#![feature(globs)]
-#![feature(unboxed_closures)]
+#![feature(globs, macro_rules, slicing_syntax, unboxed_closures)]

 extern crate core;

@ -74,11 +73,14 @@ pub mod char {
 }

 pub mod str {
-    pub use u_str::{UnicodeStrPrelude, Words, Graphemes, GraphemeIndices};
+    pub use u_str::{UnicodeStr, Words, Graphemes, GraphemeIndices};
+    pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};
+    pub use u_str::{utf16_items, Utf16Encoder};
 }

-// this lets us use #[deriving(Clone)]
+// this lets us use #[deriving(..)]
 mod std {
    pub use core::clone;
    pub use core::cmp;
+    pub use core::fmt;
 }
--- a/src/libunicode/u_str.rs
+++ b/src/libunicode/u_str.rs
@ -15,24 +15,36 @@
 //! This module provides functionality to `str` that requires the Unicode methods provided by the
 //! UnicodeChar trait.

-use self::GraphemeState::*;
+use core::prelude::*;
+
+use core::char;
 use core::cmp;
-use core::slice::SliceExt;
-use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
 use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt};
+use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
+use core::iter::{Filter, AdditiveIterator};
 use core::kinds::Sized;
-use core::option::Option;
+use core::mem;
+use core::num::Int;
 use core::option::Option::{None, Some};
+use core::option::Option;
+use core::slice::SliceExt;
+use core::slice;
 use core::str::{CharSplits, StrPrelude};
+use core::str::{CharSplits};
+
 use u_char::UnicodeChar;
 use tables::grapheme::GraphemeCat;

 /// An iterator over the words of a string, separated by a sequence of whitespace
 /// FIXME: This should be opaque
-pub type Words<'a> = Filter<&'a str, CharSplits<'a, fn(char) -> bool>, fn(&&str) -> bool>;
+#[stable]
+pub struct Words<'a> {
+    inner: Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>,
+                  fn(&&str) -> bool>,
+}

 /// Methods for Unicode string slices
-pub trait UnicodeStrPrelude for Sized? {
+pub trait UnicodeStr for Sized? {
    /// Returns an iterator over the
    /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
    /// of the string.
@ -77,6 +89,7 @@ pub trait UnicodeStrPrelude for Sized? {
    /// let v: Vec<&str> = some_words.words().collect();
    /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
    /// ```
+    #[stable]
    fn words<'a>(&'a self) -> Words<'a>;

    /// Returns true if the string contains only whitespace.
@ -129,7 +142,7 @@ pub trait UnicodeStrPrelude for Sized? {
    fn trim_right<'a>(&'a self) -> &'a str;
 }

-impl UnicodeStrPrelude for str {
+impl UnicodeStr for str {
    #[inline]
    fn graphemes(&self, is_extended: bool) -> Graphemes {
        Graphemes { string: self, extended: is_extended, cat: None, catb: None }
@ -145,7 +158,7 @@ impl UnicodeStrPrelude for str {
        fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
        fn is_whitespace(c: char) -> bool { c.is_whitespace() }

-        self.split(is_whitespace).filter(is_not_empty)
+        Words { inner: self.split(is_whitespace).filter(is_not_empty) }
    }

    #[inline]
@ -428,3 +441,196 @@ impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
        Some(retstr)
    }
 }
+
+// https://tools.ietf.org/html/rfc3629
+static UTF8_CHAR_WIDTH: [u8, ..256] = [
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
+0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
+3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
+4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
+];
+
+/// Given a first byte, determine how many bytes are in this UTF-8 character
+#[inline]
+pub fn utf8_char_width(b: u8) -> uint {
+    return UTF8_CHAR_WIDTH[b as uint] as uint;
+}
+
+/// Determines if a vector of `u16` contains valid UTF-16 (every surrogate is part of a leading/trailing pair)
+pub fn is_utf16(v: &[u16]) -> bool {
+    let mut it = v.iter();
+    macro_rules! next ( ($ret:expr) => {
+            match it.next() { Some(u) => *u, None => return $ret }
+        }
+    )
+    loop {
+        let u = next!(true); // input exhausted between characters: everything seen was valid
+        // a widened `u16` fails `char::from_u32` only when it is a surrogate
+        match char::from_u32(u as u32) {
+            Some(_) => {}
+            None => {
+                let u2 = next!(false); // a surrogate with nothing after it is invalid
+                if u < 0xD800 || u > 0xDBFF || // `u` must be a leading surrogate (0xD800..=0xDBFF)
+                    u2 < 0xDC00 || u2 > 0xDFFF { return false; } // and `u2` a trailing one
+            }
+        }
+    }
+}
+
+/// An iterator that decodes UTF-16 encoded codepoints from a vector
+/// of `u16`s.
+#[deriving(Clone)]
+pub struct Utf16Items<'a> {
+    iter: slice::Items<'a, u16>
+}
+/// The possibilities for values decoded from a `u16` stream.
+#[deriving(PartialEq, Eq, Clone, Show)]
+pub enum Utf16Item {
+    /// A valid codepoint.
+    ScalarValue(char),
+    /// An invalid surrogate without its pair.
+    LoneSurrogate(u16)
+}
+
+impl Copy for Utf16Item {}
+
+impl Utf16Item {
+    /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
+    /// replacement character (U+FFFD).
+    #[inline]
+    pub fn to_char_lossy(&self) -> char {
+        match *self {
+            Utf16Item::ScalarValue(c) => c,
+            Utf16Item::LoneSurrogate(_) => '\uFFFD'
+        }
+    }
+}
+
+impl<'a> Iterator<Utf16Item> for Utf16Items<'a> {
+    fn next(&mut self) -> Option<Utf16Item> {
+        let u = match self.iter.next() {
+            Some(u) => *u,
+            None => return None
+        };
+
+        if u < 0xD800 || 0xDFFF < u {
+            // not a surrogate, so the unit is itself a scalar value
+            Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
+        } else if u >= 0xDC00 {
+            // a trailing surrogate with no leading surrogate before it
+            Some(Utf16Item::LoneSurrogate(u))
+        } else {
+            // leading surrogate: snapshot the iterator so we can rewind.
+            let old = self.iter;
+
+            let u2 = match self.iter.next() {
+                Some(u2) => *u2,
+                // eof: the leading surrogate has no partner
+                None => return Some(Utf16Item::LoneSurrogate(u))
+            };
+            if u2 < 0xDC00 || u2 > 0xDFFF {
+                // not a trailing surrogate so we're not a valid
+                // surrogate pair, so rewind to redecode u2 next time.
+                self.iter = old;
+                return Some(Utf16Item::LoneSurrogate(u))
+            }
+
+            // valid pair: join the two 10-bit halves and add the 0x1_0000 offset.
+            let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
+            Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        let (low, high) = self.iter.size_hint();
+        // we could be entirely valid surrogate pairs (2 elements per char, hence
+        // `low / 2`), or entirely non-surrogates (1 element per char, hence `high`)
+        (low / 2, high)
+    }
+}
+
+/// Create an iterator over the UTF-16 encoded codepoints in `v`,
+/// returning invalid surrogates as `LoneSurrogate`s.
+///
+/// # Example
+///
+/// ```rust
+/// use unicode::str;
+/// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
+///
+/// // 𝄞mus<invalid>ic<invalid>
+/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///          0x0073, 0xDD1E, 0x0069, 0x0063,
+///          0xD834];
+///
+/// assert_eq!(str::utf16_items(&v).collect::<Vec<_>>(),
+///            vec![ScalarValue('𝄞'),
+///                 ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
+///                 LoneSurrogate(0xDD1E),
+///                 ScalarValue('i'), ScalarValue('c'),
+///                 LoneSurrogate(0xD834)]);
+/// ```
+pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
+    Utf16Items { iter : v.iter() }
+}
+
+/// Iterator adaptor for encoding `char`s to UTF-16.
+#[deriving(Clone)]
+pub struct Utf16Encoder<I> {
+    chars: I,
+    extra: u16
+}
+
+impl<I> Utf16Encoder<I> {
+    /// Create an UTF-16 encoder from any `char` iterator.
+    pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<char> {
+        Utf16Encoder { chars: chars, extra: 0 }
+    }
+}
+
+impl<I> Iterator<u16> for Utf16Encoder<I> where I: Iterator<char> {
+    #[inline]
+    fn next(&mut self) -> Option<u16> {
+        if self.extra != 0 { // a second unit was stashed by the previous call: emit it first
+            let tmp = self.extra;
+            self.extra = 0; // 0 marks "nothing pending"
+            return Some(tmp);
+        }
+
+        let mut buf = [0u16, ..2];
+        self.chars.next().map(|ch| {
+            let n = ch.encode_utf16(buf[mut]).unwrap_or(0);
+            if n == 2 { self.extra = buf[1]; } // two units written: keep the second for the next call
+            buf[0]
+        })
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        let (low, high) = self.chars.size_hint();
+        // every char gets either one u16 or two u16,
+        // so this iterator is between 1 or 2 times as
+        // long as the underlying iterator (no upper bound if doubling overflows).
+        (low, high.and_then(|n| n.checked_mul(2)))
+    }
+}
+
+impl<'a> Iterator<&'a str> for Words<'a> {
+    fn next(&mut self) -> Option<&'a str> { self.inner.next() }
+}
+impl<'a> DoubleEndedIterator<&'a str> for Words<'a> {
+    fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
+}