Auto merge of #40189 - SimonSapin:one-width, r=alexcrichton

Reduce std_unicode’s public API

* Only keep one copy of the `UTF8_CHAR_WIDTH` table instead of one in each of libcore and libstd_unicode.
* Move the `utf8_char_width` function to `core::str` under the `str_internals` unstable feature.
* Remove `std_unicode::str::is_utf16`. It was only accessible through the `#[unstable]` crate std_unicode. It has never been used in the compiler or standard library since 47e7a05 added it in 2012 “for OS API interop”. It can be replaced with a one-liner:

```rust
fn is_utf16(slice: &[u16]) -> bool {
    std::char::decode_utf16(slice.iter().cloned()).all(|r| r.is_ok())
}
```
2024-11-22 23:04:33 +00:00 · 2017-03-03 09:57:57 +00:00 · 2017-03-03 09:57:57 +00:00 · 1476105dd3
commit 1476105dd3
parent 042728e7ff 24b39c51af
8 changed files with 12 additions and 118 deletions
--- a/src/libcollections/lib.rs
+++ b/src/libcollections/lib.rs
@ -54,6 +54,7 @@
 #![feature(slice_patterns)]
 #![feature(specialization)]
 #![feature(staged_api)]
+#![feature(str_internals)]
 #![feature(trusted_len)]
 #![feature(unicode)]
 #![feature(unique)]
--- a/src/libcollections/string.rs
+++ b/src/libcollections/string.rs
@ -62,9 +62,9 @@ use core::iter::{FromIterator, FusedIterator};
 use core::mem;
 use core::ops::{self, Add, AddAssign, Index, IndexMut};
 use core::ptr;
+use core::str as core_str;
 use core::str::pattern::Pattern;
 use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
-use std_unicode::str as unicode_str;

 use borrow::{Cow, ToOwned};
 use range::RangeArgument;
@ -575,7 +575,7 @@ impl String {
            if byte < 128 {
                // subseqidx handles this
            } else {
-                let w = unicode_str::utf8_char_width(byte);
+                let w = core_str::utf8_char_width(byte);

                match w {
                    2 => {
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@ -540,71 +540,6 @@ fn from_utf8_mostly_ascii() {
    }
 }

-#[test]
-fn test_is_utf16() {
-    use std_unicode::str::is_utf16;
-
-    macro_rules! pos {
-        ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } }
-    }
-
-    // non-surrogates
-    pos!(&[0x0000],
-         &[0x0001, 0x0002],
-         &[0xD7FF],
-         &[0xE000]);
-
-    // surrogate pairs (randomly generated with Python 3's
-    // .encode('utf-16be'))
-    pos!(&[0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
-         &[0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
-         &[0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
-
-    // mixtures (also random)
-    pos!(&[0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
-         &[0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
-         &[0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
-
-    // negative tests
-    macro_rules! neg {
-        ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } }
-    }
-
-    neg!(
-        // surrogate + regular unit
-        &[0xdb45, 0x0000],
-        // surrogate + lead surrogate
-        &[0xd900, 0xd900],
-        // unterminated surrogate
-        &[0xd8ff],
-        // trail surrogate without a lead
-        &[0xddb7]);
-
-    // random byte sequences that Python 3's .decode('utf-16be')
-    // failed on
-    neg!(&[0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
-         &[0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
-         &[0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
-         &[0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
-         &[0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
-         &[0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
-         &[0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
-         &[0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
-         &[0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
-         &[0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
-         &[0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
-         &[0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
-         &[0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
-         &[0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
-         &[0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
-         &[0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
-         &[0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
-         &[0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
-         &[0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
-         &[0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
-         &[0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
-}
-
 #[test]
 fn test_as_bytes() {
    // no null
--- a/src/libcollectionstest/string.rs
+++ b/src/libcollectionstest/string.rs
@ -129,7 +129,7 @@ fn test_from_utf16() {
        let s_as_utf16 = s.encode_utf16().collect::<Vec<u16>>();
        let u_as_string = String::from_utf16(&u).unwrap();

-        assert!(::std_unicode::str::is_utf16(&u));
+        assert!(::std_unicode::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok()));
        assert_eq!(s_as_utf16, u);

        assert_eq!(u_as_string, s);
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@ -1352,6 +1352,13 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [
 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
 ];

+/// Given a first byte, determine how many bytes are in this UTF-8 character
+#[unstable(feature = "str_internals", issue = "0")]
+#[inline]
+pub fn utf8_char_width(b: u8) -> usize {
+    return UTF8_CHAR_WIDTH[b as usize] as usize;
+}
+
 /// Mask of the value bits of a continuation byte
 const CONT_MASK: u8 = 0b0011_1111;
 /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
--- a/src/libstd/io/mod.rs
+++ b/src/libstd/io/mod.rs
@ -256,7 +256,7 @@
 #![stable(feature = "rust1", since = "1.0.0")]

 use cmp;
-use std_unicode::str as core_str;
+use core::str as core_str;
 use error as std_error;
 use fmt;
 use result;
--- a/src/libstd_unicode/lib.rs
+++ b/src/libstd_unicode/lib.rs
@ -47,7 +47,6 @@ pub mod char;
 #[allow(deprecated)]
 pub mod str {
    pub use u_str::{SplitWhitespace, UnicodeStr};
-    pub use u_str::{is_utf16, utf8_char_width};
    pub use u_str::Utf16Encoder;
 }

--- a/src/libstd_unicode/u_str.rs
+++ b/src/libstd_unicode/u_str.rs
@ -77,54 +77,6 @@ impl UnicodeStr for str {
    }
 }

-// https://tools.ietf.org/html/rfc3629
-static UTF8_CHAR_WIDTH: [u8; 256] = [
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
-0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
-4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
-];
-
-/// Given a first byte, determine how many bytes are in this UTF-8 character
-#[inline]
-pub fn utf8_char_width(b: u8) -> usize {
-    return UTF8_CHAR_WIDTH[b as usize] as usize;
-}
-
-/// Determines if a vector of `u16` contains valid UTF-16
-pub fn is_utf16(v: &[u16]) -> bool {
-    let mut it = v.iter();
-    macro_rules! next { ($ret:expr) => {
-            match it.next() { Some(u) => *u, None => return $ret }
-        }
-    }
-    loop {
-        let u = next!(true);
-
-        match char::from_u32(u as u32) {
-            Some(_) => {}
-            None => {
-                let u2 = next!(false);
-                if u < 0xD7FF || u > 0xDBFF || u2 < 0xDC00 || u2 > 0xDFFF {
-                    return false;
-                }
-            }
-        }
-    }
-}
-
 /// Iterator adaptor for encoding `char`s to UTF-16.
 #[derive(Clone)]
 pub struct Utf16Encoder<I> {