Auto merge of #21488 - aturon:os-str, r=alexcrichton

Per [RFC 517](https://github.com/rust-lang/rfcs/pull/575/), this commit introduces platform-native strings. The API is essentially as described in the RFC. The WTF-8 implementation is adapted from @SimonSapin's [implementation](https://github.com/SimonSapin/rust-wtf8). To make this work, some encoding and decoding functionality in `libcore` is now exported in a "raw" fashion reusable for WTF-8. These exports are *not* reexported in `std`, nor are they stable.
2025-05-14 02:49:40 +00:00 · 2015-01-24 19:39:52 +00:00 · 2015-01-24 19:39:52 +00:00 · bb7cc4eb26
commit bb7cc4eb26
parent 76fbb35831 c5369ebc7f
12 changed files with 1850 additions and 92 deletions
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@ -258,49 +258,69 @@ impl CharExt for char {
    #[inline]
    #[unstable = "pending decision about Iterator/Writer/Reader"]
    fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> {
-        // Marked #[inline] to allow llvm optimizing it away
-        let code = self as u32;
-        if code < MAX_ONE_B && dst.len() >= 1 {
-            dst[0] = code as u8;
-            Some(1)
-        } else if code < MAX_TWO_B && dst.len() >= 2 {
-            dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
-            dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(2)
-        } else if code < MAX_THREE_B && dst.len() >= 3  {
-            dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
-            dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(3)
-        } else if dst.len() >= 4 {
-            dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
-            dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(4)
-        } else {
-            None
-        }
+        encode_utf8_raw(self as u32, dst)
    }

    #[inline]
    #[unstable = "pending decision about Iterator/Writer/Reader"]
    fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> {
-        // Marked #[inline] to allow llvm optimizing it away
-        let mut ch = self as u32;
-        if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
-            // The BMP falls through (assuming non-surrogate, as it should)
-            dst[0] = ch as u16;
-            Some(1)
-        } else if dst.len() >= 2 {
-            // Supplementary planes break into surrogates.
-            ch -= 0x1_0000_u32;
-            dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
-            dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
-            Some(2)
-        } else {
-            None
-        }
+        encode_utf16_raw(self as u32, dst)
+    }
+}
+
+/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
+/// and then returns the number of bytes written.
+///
+/// If the buffer is not large enough, nothing will be written into it
+/// and a `None` will be returned.
+#[inline]
+#[unstable]
+pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if code < MAX_ONE_B && dst.len() >= 1 {
+        dst[0] = code as u8;
+        Some(1)
+    } else if code < MAX_TWO_B && dst.len() >= 2 {
+        dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
+        dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(2)
+    } else if code < MAX_THREE_B && dst.len() >= 3  {
+        dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
+        dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(3)
+    } else if dst.len() >= 4 {
+        dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
+        dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(4)
+    } else {
+        None
+    }
+}
+
+/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
+/// and then returns the number of `u16`s written.
+///
+/// If the buffer is not large enough, nothing will be written into it
+/// and a `None` will be returned.
+#[inline]
+#[unstable]
+pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
+        // The BMP falls through (assuming non-surrogate, as it should)
+        dst[0] = ch as u16;
+        Some(1)
+    } else if dst.len() >= 2 {
+        // Supplementary planes break into surrogates.
+        ch -= 0x1_0000_u32;
+        dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
+        dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
+        Some(2)
+    } else {
+        None
    }
 }

--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    }
 }

+/// Reads the next code point out of a byte iterator (assuming a
+/// UTF-8-like encoding).
+#[unstable]
+pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
+    // Decode UTF-8
+    let x = match bytes.next() {
+        None => return None,
+        Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
+        Some(&next_byte) => next_byte,
+    };
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    // NOTE: Performance is sensitive to the exact formulation here
+    let init = utf8_first_byte!(x, 2);
+    let y = unwrap_or_0(bytes.next());
+    let mut ch = utf8_acc_cont_byte!(init, y);
+    if x >= 0xE0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        let z = unwrap_or_0(bytes.next());
+        let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xF0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            let w = unwrap_or_0(bytes.next());
+            ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
+        }
+    }
+
+    Some(ch)
+}
+
 #[stable]
 impl<'a> Iterator for Chars<'a> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
-        // Decode UTF-8, using the valid UTF-8 invariant
-        let x = match self.iter.next() {
-            None => return None,
-            Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
-            Some(&next_byte) => next_byte,
-        };
-
-        // Multibyte case follows
-        // Decode from a byte combination out of: [[[x y] z] w]
-        // NOTE: Performance is sensitive to the exact formulation here
-        let init = utf8_first_byte!(x, 2);
-        let y = unwrap_or_0(self.iter.next());
-        let mut ch = utf8_acc_cont_byte!(init, y);
-        if x >= 0xE0 {
-            // [[x y z] w] case
-            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
-            let z = unwrap_or_0(self.iter.next());
-            let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
-            ch = init << 12 | y_z;
-            if x >= 0xF0 {
-                // [x y z w] case
-                // use only the lower 3 bits of `init`
-                let w = unwrap_or_0(self.iter.next());
-                ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
+        next_code_point(&mut self.iter).map(|ch| {
+            // str invariant says `ch` is a valid Unicode Scalar Value
+            unsafe {
+                mem::transmute(ch)
            }
-        }
-
-        // str invariant says `ch` is a valid Unicode Scalar Value
-        unsafe {
-            Some(mem::transmute(ch))
-        }
+        })
    }

    #[inline]
@ -1517,25 +1526,8 @@ impl StrExt for str {

    #[inline]
    fn char_range_at(&self, i: uint) -> CharRange {
-        if self.as_bytes()[i] < 128u8 {
-            return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 };
-        }
-
-        // Multibyte case is a fn to allow char_range_at to inline cleanly
-        fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
-            let mut val = s.as_bytes()[i] as u32;
-            let w = UTF8_CHAR_WIDTH[val as uint] as uint;
-            assert!((w != 0));
-
-            val = utf8_first_byte!(val, w);
-            val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]);
-            if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); }
-            if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); }
-
-            return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w};
-        }
-
-        return multibyte_char_range_at(self, i);
+        let (c, n) = char_range_at_raw(self.as_bytes(), i);
+        CharRange { ch: unsafe { mem::transmute(c) }, next: n }
    }

    #[inline]
@ -1653,6 +1645,32 @@ impl StrExt for str {
    fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) }
 }

+/// Pluck a code point out of a UTF-8-like byte slice and return the
+/// index of the next code point.
+#[inline]
+#[unstable]
+pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) {
+    if bytes[i] < 128u8 {
+        return (bytes[i] as u32, i + 1);
+    }
+
+    // Multibyte case is a fn to allow char_range_at to inline cleanly
+    fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) {
+        let mut val = bytes[i] as u32;
+        let w = UTF8_CHAR_WIDTH[val as uint] as uint;
+        assert!((w != 0));
+
+        val = utf8_first_byte!(val, w);
+        val = utf8_acc_cont_byte!(val, bytes[i + 1]);
+        if w > 2 { val = utf8_acc_cont_byte!(val, bytes[i + 2]); }
+        if w > 3 { val = utf8_acc_cont_byte!(val, bytes[i + 3]); }
+
+        return (val, i + w);
+    }
+
+    multibyte_char_range_at(bytes, i)
+}
+
 #[stable]
 impl<'a> Default for &'a str {
    #[stable]
--- a/src/libstd/ffi/mod.rs
+++ b/src/libstd/ffi/mod.rs
@ -17,4 +17,9 @@ pub use self::c_str::CString;
 pub use self::c_str::c_str_to_bytes;
 pub use self::c_str::c_str_to_bytes_with_nul;

+pub use self::os_str::OsString;
+pub use self::os_str::OsStr;
+pub use self::os_str::AsOsStr;
+
 mod c_str;
+mod os_str;
--- a/src/libstd/ffi/os_str.rs
+++ b/src/libstd/ffi/os_str.rs
@ -0,0 +1,259 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! A type that can represent all platform-native strings, but is cheaply
+//! interconvertible with Rust strings.
+//!
+//! The need for this type arises from the fact that:
+//!
+//! * On Unix systems, strings are often arbitrary sequences of non-zero
+//!   bytes, in many cases interpreted as UTF-8.
+//!
+//! * On Windows, strings are often arbitrary sequences of non-zero 16-bit
+//!   values, interpreted as UTF-16 when it is valid to do so.
+//!
+//! * In Rust, strings are always valid UTF-8, but may contain zeros.
+//!
+//! The types in this module bridge this gap by simultaneously representing Rust
+//! and platform-native string values, and in particular allowing a Rust string
+//! to be converted into an "OS" string with no cost.
+//!
+//! **Note**: At the moment, these types are extremely bare-bones, usable only
+//! for conversion to/from various other string types. Eventually these types
+//! will offer a full-fledged string API.
+
+#![unstable = "recently added as part of path/io reform"]
+
+use core::prelude::*;
+
+use core::borrow::{BorrowFrom, ToOwned};
+use fmt::{self, Debug};
+use mem;
+use string::{String, CowString};
+use ops;
+use cmp;
+use hash::{Hash, Hasher, Writer};
+
+use sys::os_str::{Buf, Slice};
+use sys_common::{AsInner, IntoInner, FromInner};
+
+/// Owned, mutable OS strings.
+#[derive(Clone)]
+pub struct OsString {
+    inner: Buf
+}
+
+/// Slices into OS strings.
+pub struct OsStr {
+    inner: Slice
+}
+
+impl OsString {
+    /// Constructs an `OsString` at no cost by consuming a `String`.
+    pub fn from_string(s: String) -> OsString {
+        OsString { inner: Buf::from_string(s) }
+    }
+
+    /// Constructs an `OsString` by copying from a `&str` slice.
+    ///
+    /// Equivalent to: `OsString::from_string(String::from_str(s))`.
+    pub fn from_str(s: &str) -> OsString {
+        OsString { inner: Buf::from_str(s) }
+    }
+
+    /// Convert the `OsString` into a `String` if it contains valid Unicode data.
+    ///
+    /// On failure, ownership of the original `OsString` is returned.
+    pub fn into_string(self) -> Result<String, OsString> {
+        self.inner.into_string().map_err(|buf| OsString { inner: buf} )
+    }
+
+    /// Extend the string with the given `&OsStr` slice.
+    pub fn push_os_str(&mut self, s: &OsStr) {
+        self.inner.push_slice(&s.inner)
+    }
+}
+
+impl ops::Index<ops::FullRange> for OsString {
+    type Output = OsStr;
+
+    #[inline]
+    fn index(&self, _index: &ops::FullRange) -> &OsStr {
+        unsafe { mem::transmute(self.inner.as_slice()) }
+    }
+}
+
+impl ops::Deref for OsString {
+    type Target = OsStr;
+
+    #[inline]
+    fn deref(&self) -> &OsStr {
+        &self[]
+    }
+}
+
+impl Debug for OsString {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        fmt::Debug::fmt(&**self, formatter)
+    }
+}
+
+impl OsStr {
+    /// Coerce directly from a `&str` slice to a `&OsStr` slice.
+    pub fn from_str(s: &str) -> &OsStr {
+        unsafe { mem::transmute(Slice::from_str(s)) }
+    }
+
+    /// Yield a `&str` slice if the `OsStr` is valid Unicode.
+    ///
+    /// This conversion may entail doing a check for UTF-8 validity.
+    pub fn to_str(&self) -> Option<&str> {
+        self.inner.to_str()
+    }
+
+    /// Convert an `OsStr` to a `CowString`.
+    ///
+    /// Any non-Unicode sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
+    pub fn to_string_lossy(&self) -> CowString {
+        self.inner.to_string_lossy()
+    }
+
+    /// Copy the slice into an owned `OsString`.
+    pub fn to_os_string(&self) -> OsString {
+        OsString { inner: self.inner.to_owned() }
+    }
+
+    /// Get the underlying byte representation.
+    ///
+    /// Note: it is *crucial* that this API is private, to avoid
+    /// revealing the internal, platform-specific encodings.
+    fn bytes(&self) -> &[u8] {
+        unsafe { mem::transmute(&self.inner) }
+    }
+}
+
+impl PartialEq for OsStr {
+    fn eq(&self, other: &OsStr) -> bool {
+        self.bytes().eq(other.bytes())
+    }
+}
+
+impl PartialEq<str> for OsStr {
+    fn eq(&self, other: &str) -> bool {
+        *self == *OsStr::from_str(other)
+    }
+}
+
+impl PartialEq<OsStr> for str {
+    fn eq(&self, other: &OsStr) -> bool {
+        *other == *OsStr::from_str(self)
+    }
+}
+
+impl Eq for OsStr {}
+
+impl PartialOrd for OsStr {
+    #[inline]
+    fn partial_cmp(&self, other: &OsStr) -> Option<cmp::Ordering> {
+        self.bytes().partial_cmp(other.bytes())
+    }
+    #[inline]
+    fn lt(&self, other: &OsStr) -> bool { self.bytes().lt(other.bytes()) }
+    #[inline]
+    fn le(&self, other: &OsStr) -> bool { self.bytes().le(other.bytes()) }
+    #[inline]
+    fn gt(&self, other: &OsStr) -> bool { self.bytes().gt(other.bytes()) }
+    #[inline]
+    fn ge(&self, other: &OsStr) -> bool { self.bytes().ge(other.bytes()) }
+}
+
+impl PartialOrd<str> for OsStr {
+    #[inline]
+    fn partial_cmp(&self, other: &str) -> Option<cmp::Ordering> {
+        self.partial_cmp(OsStr::from_str(other))
+    }
+}
+
+// FIXME (#19470): cannot provide PartialOrd<OsStr> for str until we
+// have more flexible coherence rules.
+
+impl Ord for OsStr {
+    #[inline]
+    fn cmp(&self, other: &OsStr) -> cmp::Ordering { self.bytes().cmp(other.bytes()) }
+}
+
+impl<'a, S: Hasher + Writer> Hash<S> for OsStr {
+    #[inline]
+    fn hash(&self, state: &mut S) {
+        self.bytes().hash(state)
+    }
+}
+
+impl Debug for OsStr {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        self.inner.fmt(formatter)
+    }
+}
+
+impl BorrowFrom<OsString> for OsStr {
+    fn borrow_from(owned: &OsString) -> &OsStr { &owned[] }
+}
+
+impl ToOwned<OsString> for OsStr {
+    fn to_owned(&self) -> OsString { self.to_os_string() }
+}
+
+/// Freely convertible to an `&OsStr` slice.
+pub trait AsOsStr {
+    /// Convert to an `&OsStr` slice.
+    fn as_os_str(&self) -> &OsStr;
+}
+
+impl AsOsStr for OsStr {
+    fn as_os_str(&self) -> &OsStr {
+        self
+    }
+}
+
+impl AsOsStr for OsString {
+    fn as_os_str(&self) -> &OsStr {
+        &self[]
+    }
+}
+
+impl AsOsStr for str {
+    fn as_os_str(&self) -> &OsStr {
+        OsStr::from_str(self)
+    }
+}
+
+impl AsOsStr for String {
+    fn as_os_str(&self) -> &OsStr {
+        OsStr::from_str(&self[])
+    }
+}
+
+impl FromInner<Buf> for OsString {
+    fn from_inner(buf: Buf) -> OsString {
+        OsString { inner: buf }
+    }
+}
+
+impl IntoInner<Buf> for OsString {
+    fn into_inner(self) -> Buf {
+        self.inner
+    }
+}
+
+impl AsInner<Slice> for OsStr {
+    fn as_inner(&self) -> &Slice {
+        &self.inner
+    }
+}
--- a/src/libstd/sys/common/mod.rs
+++ b/src/libstd/sys/common/mod.rs
@ -29,6 +29,7 @@ pub mod stack;
 pub mod thread;
 pub mod thread_info;
 pub mod thread_local;
+pub mod wtf8;

 // common error constructors

@ -93,11 +94,21 @@ pub fn keep_going<F>(data: &[u8], mut f: F) -> i64 where
    return (origamt - amt) as i64;
 }

-// A trait for extracting representations from std::io types
-pub trait AsInner<Inner> {
+/// A trait for viewing representations from std types
+pub trait AsInner<Inner: ?Sized> {
    fn as_inner(&self) -> &Inner;
 }

+/// A trait for extracting representations from std types
+pub trait IntoInner<Inner> {
+    fn into_inner(self) -> Inner;
+}
+
+/// A trait for creating std types from internal representations
+pub trait FromInner<Inner> {
+    fn from_inner(inner: Inner) -> Self;
+}
+
 pub trait ProcessConfig<K: BytesContainer, V: BytesContainer> {
    fn program(&self) -> &CString;
    fn args(&self) -> &[CString];
--- a/src/libstd/sys/common/wtf8.rs
+++ b/src/libstd/sys/common/wtf8.rs
--- a/src/libstd/sys/unix/ext.rs
+++ b/src/libstd/sys/unix/ext.rs
@ -31,7 +31,10 @@

 #![unstable]

-use sys_common::AsInner;
+use vec::Vec;
+use sys::os_str::Buf;
+use sys_common::{AsInner, IntoInner, FromInner};
+use ffi::{OsStr, OsString};
 use libc;

 use io;
@ -99,6 +102,36 @@ impl AsRawFd for io::net::udp::UdpSocket {
    }
 }

+/// Unix-specific extensions to `OsString`.
+pub trait OsStringExt {
+    /// Create an `OsString` from a byte vector.
+    fn from_vec(vec: Vec<u8>) -> Self;
+
+    /// Yield the underlying byte vector of this `OsString`.
+    fn into_vec(self) -> Vec<u8>;
+}
+
+impl OsStringExt for OsString {
+    fn from_vec(vec: Vec<u8>) -> OsString {
+        FromInner::from_inner(Buf { inner: vec })
+    }
+
+    fn into_vec(self) -> Vec<u8> {
+        self.into_inner().inner
+    }
+}
+
+/// Unix-specific extensions to `OsStr`.
+pub trait OsStrExt {
+    fn as_byte_slice(&self) -> &[u8];
+}
+
+impl OsStrExt for OsStr {
+    fn as_byte_slice(&self) -> &[u8] {
+        &self.as_inner().inner
+    }
+}
+
 /// A prelude for conveniently writing platform-specific code.
 ///
 /// Includes all extension traits, and some important type definitions.
--- a/src/libstd/sys/unix/mod.rs
+++ b/src/libstd/sys/unix/mod.rs
@ -44,6 +44,7 @@ pub mod fs;
 pub mod helper_signal;
 pub mod mutex;
 pub mod os;
+pub mod os_str;
 pub mod pipe;
 pub mod process;
 pub mod rwlock;
--- a/src/libstd/sys/unix/os_str.rs
+++ b/src/libstd/sys/unix/os_str.rs
@ -0,0 +1,86 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The underlying OsString/OsStr implementation on Unix systems: just
+//! a `Vec<u8>`/`[u8]`.
+
+use core::prelude::*;
+
+use fmt::{self, Debug};
+use vec::Vec;
+use slice::SliceExt as StdSliceExt;
+use str;
+use string::{String, CowString};
+use mem;
+
+#[derive(Clone)]
+pub struct Buf {
+    pub inner: Vec<u8>
+}
+
+pub struct Slice {
+    pub inner: [u8]
+}
+
+impl Debug for Slice {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        self.to_string_lossy().fmt(formatter)
+    }
+}
+
+impl Debug for Buf {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        self.as_slice().fmt(formatter)
+    }
+}
+
+impl Buf {
+    pub fn from_string(s: String) -> Buf {
+        Buf { inner: s.into_bytes() }
+    }
+
+    pub fn from_str(s: &str) -> Buf {
+        Buf { inner: s.as_bytes().to_vec() }
+    }
+
+    pub fn as_slice(&self) -> &Slice {
+        unsafe { mem::transmute(self.inner.as_slice()) }
+    }
+
+    pub fn into_string(self) -> Result<String, Buf> {
+        String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() } )
+    }
+
+    pub fn push_slice(&mut self, s: &Slice) {
+        self.inner.push_all(&s.inner)
+    }
+}
+
+impl Slice {
+    fn from_u8_slice(s: &[u8]) -> &Slice {
+        unsafe { mem::transmute(s) }
+    }
+
+    pub fn from_str(s: &str) -> &Slice {
+        unsafe { mem::transmute(s.as_bytes()) }
+    }
+
+    pub fn to_str(&self) -> Option<&str> {
+        str::from_utf8(&self.inner).ok()
+    }
+
+    pub fn to_string_lossy(&self) -> CowString {
+        String::from_utf8_lossy(&self.inner)
+    }
+
+    pub fn to_owned(&self) -> Buf {
+        Buf { inner: self.inner.to_vec() }
+    }
+}
--- a/src/libstd/sys/windows/ext.rs
+++ b/src/libstd/sys/windows/ext.rs
@ -16,7 +16,11 @@

 #![unstable]

-use sys_common::AsInner;
+pub use sys_common::wtf8::{Wtf8Buf, EncodeWide};
+
+use sys::os_str::Buf;
+use sys_common::{AsInner, FromInner};
+use ffi::{OsStr, OsString};
 use libc;

 use io;
@ -92,9 +96,35 @@ impl AsRawSocket for io::net::udp::UdpSocket {
    }
 }

+/// Windows-specific extensions to `OsString`.
+pub trait OsStringExt {
+    /// Create an `OsString` from a potentially ill-formed UTF-16 slice of 16-bit code units.
+    ///
+    /// This is lossless: calling `.encode_wide()` on the resulting string
+    /// will always return the original code units.
+    fn from_wide(wide: &[u16]) -> Self;
+}
+
+impl OsStringExt for OsString {
+    fn from_wide(wide: &[u16]) -> OsString {
+        FromInner::from_inner(Buf { inner: Wtf8Buf::from_wide(wide) })
+    }
+}
+
+/// Windows-specific extensions to `OsStr`.
+pub trait OsStrExt {
+    fn encode_wide(&self) -> EncodeWide;
+}
+
+impl OsStrExt for OsStr {
+    fn encode_wide(&self) -> EncodeWide {
+        self.as_inner().inner.encode_wide()
+    }
+}
+
 /// A prelude for conveniently writing platform-specific code.
 ///
 /// Includes all extension traits, and some important type definitions.
 pub mod prelude {
-    pub use super::{Socket, Handle, AsRawSocket, AsRawHandle};
+    pub use super::{Socket, Handle, AsRawSocket, AsRawHandle, OsStrExt, OsStringExt};
 }
--- a/src/libstd/sys/windows/mod.rs
+++ b/src/libstd/sys/windows/mod.rs
@ -44,6 +44,7 @@ pub mod fs;
 pub mod helper_signal;
 pub mod mutex;
 pub mod os;
+pub mod os_str;
 pub mod pipe;
 pub mod process;
 pub mod rwlock;
--- a/src/libstd/sys/windows/os_str.rs
+++ b/src/libstd/sys/windows/os_str.rs
@ -0,0 +1,82 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The underlying OsString/OsStr implementation on Windows is a
+//! wrapper around the "WTF-8" encoding; see the `wtf8` module for more.
+
+use fmt::{self, Debug};
+use sys_common::wtf8::{Wtf8, Wtf8Buf};
+use string::{String, CowString};
+use result::Result;
+use option::Option;
+use mem;
+
+#[derive(Clone)]
+pub struct Buf {
+    pub inner: Wtf8Buf
+}
+
+impl Debug for Buf {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        self.as_slice().fmt(formatter)
+    }
+}
+
+pub struct Slice {
+    pub inner: Wtf8
+}
+
+impl Debug for Slice {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        self.inner.fmt(formatter)
+    }
+}
+
+impl Buf {
+    pub fn from_string(s: String) -> Buf {
+        Buf { inner: Wtf8Buf::from_string(s) }
+    }
+
+    pub fn from_str(s: &str) -> Buf {
+        Buf { inner: Wtf8Buf::from_str(s) }
+    }
+
+    pub fn as_slice(&self) -> &Slice {
+        unsafe { mem::transmute(self.inner.as_slice()) }
+    }
+
+    pub fn into_string(self) -> Result<String, Buf> {
+        self.inner.into_string().map_err(|buf| Buf { inner: buf })
+    }
+
+    pub fn push_slice(&mut self, s: &Slice) {
+        self.inner.push_wtf8(&s.inner)
+    }
+}
+
+impl Slice {
+    pub fn from_str(s: &str) -> &Slice {
+        unsafe { mem::transmute(Wtf8::from_str(s)) }
+    }
+
+    pub fn to_str(&self) -> Option<&str> {
+        self.inner.as_str()
+    }
+
+    pub fn to_string_lossy(&self) -> CowString {
+        self.inner.to_string_lossy()
+    }
+
+    pub fn to_owned(&self) -> Buf {
+        let mut buf = Wtf8Buf::with_capacity(self.inner.len());
+        buf.push_wtf8(&self.inner);
+        Buf { inner: buf }
+    }
+}