mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-22 23:04:33 +00:00
Auto merge of #96869 - sunfishcode:main, r=joshtriplett
Optimize `Wtf8Buf::into_string` for the case where it contains UTF-8. Add an `is_known_utf8` flag to `Wtf8Buf`, which tracks whether the string is known to contain UTF-8. This is efficiently computed in many common situations, such as when a `Wtf8Buf` is constructed from a `String` or `&str`, or with `Wtf8Buf::from_wide`, which is already doing UTF-16 decoding and already checking for surrogates. This makes `OsString::into_string` O(1) rather than O(N) on Windows in common cases. And, it eliminates the need to scan through the string for surrogates in `Args::next` and `Vars::next`, because the strings are already being translated with `Wtf8Buf::from_wide`. Many things on Windows construct `OsString`s with `Wtf8Buf::from_wide`, such as `DirEntry::file_name` and `fs::read_link`, so with this patch, users of those functions can subsequently call `.into_string()` without paying for an extra scan through the string for surrogates. r? `@ghost`
This commit is contained in:
commit
25ea5a36c6
@ -164,9 +164,7 @@ impl Slice {
|
||||
}
|
||||
|
||||
pub fn to_owned(&self) -> Buf {
|
||||
let mut buf = Wtf8Buf::with_capacity(self.inner.len());
|
||||
buf.push_wtf8(&self.inner);
|
||||
Buf { inner: buf }
|
||||
Buf { inner: self.inner.to_owned() }
|
||||
}
|
||||
|
||||
pub fn clone_into(&self, buf: &mut Buf) {
|
||||
|
@ -89,6 +89,24 @@ impl CodePoint {
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Returns the numeric value of the code point if it is a leading surrogate.
|
||||
#[inline]
|
||||
pub fn to_lead_surrogate(&self) -> Option<u16> {
|
||||
match self.value {
|
||||
lead @ 0xD800..=0xDBFF => Some(lead as u16),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the numeric value of the code point if it is a trailing surrogate.
|
||||
#[inline]
|
||||
pub fn to_trail_surrogate(&self) -> Option<u16> {
|
||||
match self.value {
|
||||
trail @ 0xDC00..=0xDFFF => Some(trail as u16),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Optionally returns a Unicode scalar value for the code point.
|
||||
///
|
||||
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
|
||||
@ -117,6 +135,14 @@ impl CodePoint {
|
||||
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
|
||||
pub struct Wtf8Buf {
|
||||
bytes: Vec<u8>,
|
||||
|
||||
/// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
|
||||
/// know this if we're constructed from a `String` or `&str`.
|
||||
///
|
||||
/// It is possible for `bytes` to have valid UTF-8 without this being
|
||||
/// set, such as when we're concatenating `&Wtf8`'s and surrogates become
|
||||
/// paired, as we don't bother to rescan the entire string.
|
||||
is_known_utf8: bool,
|
||||
}
|
||||
|
||||
impl ops::Deref for Wtf8Buf {
|
||||
@ -147,13 +173,13 @@ impl Wtf8Buf {
|
||||
/// Creates a new, empty WTF-8 string.
|
||||
#[inline]
|
||||
pub fn new() -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: Vec::new() }
|
||||
Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
|
||||
}
|
||||
|
||||
/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
|
||||
#[inline]
|
||||
pub fn with_capacity(capacity: usize) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: Vec::with_capacity(capacity) }
|
||||
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
|
||||
}
|
||||
|
||||
/// Creates a WTF-8 string from a UTF-8 `String`.
|
||||
@ -163,7 +189,7 @@ impl Wtf8Buf {
|
||||
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
|
||||
#[inline]
|
||||
pub fn from_string(string: String) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: string.into_bytes() }
|
||||
Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
|
||||
}
|
||||
|
||||
/// Creates a WTF-8 string from a UTF-8 `&str` slice.
|
||||
@ -173,11 +199,12 @@ impl Wtf8Buf {
|
||||
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
|
||||
#[inline]
|
||||
pub fn from_str(str: &str) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
|
||||
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true }
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.bytes.clear()
|
||||
self.bytes.clear();
|
||||
self.is_known_utf8 = true;
|
||||
}
|
||||
|
||||
/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
|
||||
@ -193,9 +220,11 @@ impl Wtf8Buf {
|
||||
let surrogate = surrogate.unpaired_surrogate();
|
||||
// Surrogates are known to be in the code point range.
|
||||
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
|
||||
// The string will now contain an unpaired surrogate.
|
||||
string.is_known_utf8 = false;
|
||||
// Skip the WTF-8 concatenation check,
|
||||
// surrogate pairs are already decoded by decode_utf16
|
||||
string.push_code_point_unchecked(code_point)
|
||||
string.push_code_point_unchecked(code_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -203,7 +232,7 @@ impl Wtf8Buf {
|
||||
}
|
||||
|
||||
/// Copied from String::push
|
||||
/// This does **not** include the WTF-8 concatenation check.
|
||||
/// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
|
||||
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
|
||||
let mut bytes = [0; 4];
|
||||
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
|
||||
@ -217,6 +246,9 @@ impl Wtf8Buf {
|
||||
|
||||
#[inline]
|
||||
pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
|
||||
// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
|
||||
// cause them to change from well-formed UTF-8 to ill-formed UTF-8,
|
||||
// which would break the assumptions of the `is_known_utf8` field.
|
||||
unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
|
||||
}
|
||||
|
||||
@ -314,7 +346,15 @@ impl Wtf8Buf {
|
||||
self.push_char(decode_surrogate_pair(lead, trail));
|
||||
self.bytes.extend_from_slice(other_without_trail_surrogate);
|
||||
}
|
||||
_ => self.bytes.extend_from_slice(&other.bytes),
|
||||
_ => {
|
||||
// If we'll be pushing a string containing a surrogate, we may
|
||||
// no longer have UTF-8.
|
||||
if other.next_surrogate(0).is_some() {
|
||||
self.is_known_utf8 = false;
|
||||
}
|
||||
|
||||
self.bytes.extend_from_slice(&other.bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -331,13 +371,19 @@ impl Wtf8Buf {
|
||||
/// like concatenating ill-formed UTF-16 strings effectively would.
|
||||
#[inline]
|
||||
pub fn push(&mut self, code_point: CodePoint) {
|
||||
if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
|
||||
if let Some(trail) = code_point.to_trail_surrogate() {
|
||||
if let Some(lead) = (&*self).final_lead_surrogate() {
|
||||
let len_without_lead_surrogate = self.len() - 3;
|
||||
self.bytes.truncate(len_without_lead_surrogate);
|
||||
self.push_char(decode_surrogate_pair(lead, trail as u16));
|
||||
self.push_char(decode_surrogate_pair(lead, trail));
|
||||
return;
|
||||
}
|
||||
|
||||
// We're pushing a trailing surrogate.
|
||||
self.is_known_utf8 = false;
|
||||
} else if code_point.to_lead_surrogate().is_some() {
|
||||
// We're pushing a leading surrogate.
|
||||
self.is_known_utf8 = false;
|
||||
}
|
||||
|
||||
// No newly paired surrogates at the boundary.
|
||||
@ -364,9 +410,10 @@ impl Wtf8Buf {
|
||||
/// (that is, if the string contains surrogates),
|
||||
/// the original WTF-8 string is returned instead.
|
||||
pub fn into_string(self) -> Result<String, Wtf8Buf> {
|
||||
match self.next_surrogate(0) {
|
||||
None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
|
||||
Some(_) => Err(self),
|
||||
if self.is_known_utf8 || self.next_surrogate(0).is_none() {
|
||||
Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
|
||||
} else {
|
||||
Err(self)
|
||||
}
|
||||
}
|
||||
|
||||
@ -376,6 +423,11 @@ impl Wtf8Buf {
|
||||
///
|
||||
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “<>”)
|
||||
pub fn into_string_lossy(mut self) -> String {
|
||||
// Fast path: If we already have UTF-8, we can return it immediately.
|
||||
if self.is_known_utf8 {
|
||||
return unsafe { String::from_utf8_unchecked(self.bytes) };
|
||||
}
|
||||
|
||||
let mut pos = 0;
|
||||
loop {
|
||||
match self.next_surrogate(pos) {
|
||||
@ -398,7 +450,7 @@ impl Wtf8Buf {
|
||||
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
|
||||
pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
|
||||
let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
|
||||
Wtf8Buf { bytes: bytes.into_vec() }
|
||||
Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
|
||||
}
|
||||
}
|
||||
|
||||
@ -576,6 +628,11 @@ impl Wtf8 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
|
||||
pub fn to_owned(&self) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false }
|
||||
}
|
||||
|
||||
/// Lossily converts the string to UTF-8.
|
||||
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
|
||||
///
|
||||
@ -665,7 +722,8 @@ impl Wtf8 {
|
||||
}
|
||||
|
||||
pub fn clone_into(&self, buf: &mut Wtf8Buf) {
|
||||
self.bytes.clone_into(&mut buf.bytes)
|
||||
buf.is_known_utf8 = false;
|
||||
self.bytes.clone_into(&mut buf.bytes);
|
||||
}
|
||||
|
||||
/// Boxes this `Wtf8`.
|
||||
@ -705,12 +763,12 @@ impl Wtf8 {
|
||||
|
||||
#[inline]
|
||||
pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
|
||||
Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
|
||||
Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
|
||||
Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
@ -19,6 +19,36 @@ fn code_point_to_u32() {
|
||||
assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_point_to_lead_surrogate() {
|
||||
fn c(value: u32) -> CodePoint {
|
||||
CodePoint::from_u32(value).unwrap()
|
||||
}
|
||||
assert_eq!(c(0).to_lead_surrogate(), None);
|
||||
assert_eq!(c(0xE9).to_lead_surrogate(), None);
|
||||
assert_eq!(c(0xD800).to_lead_surrogate(), Some(0xD800));
|
||||
assert_eq!(c(0xDBFF).to_lead_surrogate(), Some(0xDBFF));
|
||||
assert_eq!(c(0xDC00).to_lead_surrogate(), None);
|
||||
assert_eq!(c(0xDFFF).to_lead_surrogate(), None);
|
||||
assert_eq!(c(0x1F4A9).to_lead_surrogate(), None);
|
||||
assert_eq!(c(0x10FFFF).to_lead_surrogate(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_point_to_trail_surrogate() {
|
||||
fn c(value: u32) -> CodePoint {
|
||||
CodePoint::from_u32(value).unwrap()
|
||||
}
|
||||
assert_eq!(c(0).to_trail_surrogate(), None);
|
||||
assert_eq!(c(0xE9).to_trail_surrogate(), None);
|
||||
assert_eq!(c(0xD800).to_trail_surrogate(), None);
|
||||
assert_eq!(c(0xDBFF).to_trail_surrogate(), None);
|
||||
assert_eq!(c(0xDC00).to_trail_surrogate(), Some(0xDC00));
|
||||
assert_eq!(c(0xDFFF).to_trail_surrogate(), Some(0xDFFF));
|
||||
assert_eq!(c(0x1F4A9).to_trail_surrogate(), None);
|
||||
assert_eq!(c(0x10FFFF).to_trail_surrogate(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_point_from_char() {
|
||||
assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
|
||||
@ -70,35 +100,66 @@ fn wtf8buf_from_string() {
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_from_wide() {
|
||||
assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b"");
|
||||
assert_eq!(
|
||||
Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
|
||||
b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"
|
||||
);
|
||||
let buf = Wtf8Buf::from_wide(&[]);
|
||||
assert_eq!(buf.bytes, b"");
|
||||
assert!(buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xDCA9]);
|
||||
assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert!(buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
|
||||
assert_eq!(buf.bytes, b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
|
||||
assert!(!buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0xD800]);
|
||||
assert_eq!(buf.bytes, b"\xED\xA0\x80");
|
||||
assert!(!buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0xDBFF]);
|
||||
assert_eq!(buf.bytes, b"\xED\xAF\xBF");
|
||||
assert!(!buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0xDC00]);
|
||||
assert_eq!(buf.bytes, b"\xED\xB0\x80");
|
||||
assert!(!buf.is_known_utf8);
|
||||
|
||||
let buf = Wtf8Buf::from_wide(&[0xDFFF]);
|
||||
assert_eq!(buf.bytes, b"\xED\xBF\xBF");
|
||||
assert!(!buf.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_push_str() {
|
||||
let mut string = Wtf8Buf::new();
|
||||
assert_eq!(string.bytes, b"");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.push_str("aé 💩");
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_push_char() {
|
||||
let mut string = Wtf8Buf::from_str("aé ");
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 ");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.push_char('💩');
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_push() {
|
||||
let mut string = Wtf8Buf::from_str("aé ");
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 ");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.push(CodePoint::from_char('💩'));
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
fn c(value: u32) -> CodePoint {
|
||||
CodePoint::from_u32(value).unwrap()
|
||||
@ -106,37 +167,46 @@ fn wtf8buf_push() {
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xD83D)); // lead
|
||||
assert!(!string.is_known_utf8);
|
||||
string.push(c(0xDCA9)); // trail
|
||||
assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xD83D)); // lead
|
||||
assert!(!string.is_known_utf8);
|
||||
string.push(c(0x20)); // not surrogate
|
||||
string.push(c(0xDCA9)); // trail
|
||||
assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xD800)); // lead
|
||||
assert!(!string.is_known_utf8);
|
||||
string.push(c(0xDBFF)); // lead
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xD800)); // lead
|
||||
assert!(!string.is_known_utf8);
|
||||
string.push(c(0xE000)); // not surrogate
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xD7FF)); // not surrogate
|
||||
assert!(string.is_known_utf8);
|
||||
string.push(c(0xDC00)); // trail
|
||||
assert!(!string.is_known_utf8);
|
||||
assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0x61)); // not surrogate, < 3 bytes
|
||||
assert!(string.is_known_utf8);
|
||||
string.push(c(0xDC00)); // trail
|
||||
assert!(!string.is_known_utf8);
|
||||
assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push(c(0xDC00)); // trail
|
||||
assert!(!string.is_known_utf8);
|
||||
assert_eq!(string.bytes, b"\xED\xB0\x80");
|
||||
}
|
||||
|
||||
@ -146,6 +216,7 @@ fn wtf8buf_push_wtf8() {
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9");
|
||||
string.push_wtf8(Wtf8::from_str(" 💩"));
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
fn w(v: &[u8]) -> &Wtf8 {
|
||||
unsafe { Wtf8::from_bytes_unchecked(v) }
|
||||
@ -161,37 +232,68 @@ fn wtf8buf_push_wtf8() {
|
||||
string.push_wtf8(w(b" ")); // not surrogate
|
||||
string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
|
||||
assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
|
||||
assert!(!string.is_known_utf8);
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push_wtf8(w(b"\xED\xA0\x80")); // lead
|
||||
string.push_wtf8(w(b"\xED\xAF\xBF")); // lead
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
|
||||
assert!(!string.is_known_utf8);
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push_wtf8(w(b"\xED\xA0\x80")); // lead
|
||||
string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate
|
||||
string.push_wtf8(w(b"\xED\xB0\x80")); // trail
|
||||
assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes
|
||||
string.push_wtf8(w(b"\xED\xB0\x80")); // trail
|
||||
assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
|
||||
let mut string = Wtf8Buf::new();
|
||||
string.push_wtf8(w(b"\xED\xB0\x80")); // trail
|
||||
assert_eq!(string.bytes, b"\xED\xB0\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_truncate() {
|
||||
let mut string = Wtf8Buf::from_str("aé");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.truncate(3);
|
||||
assert_eq!(string.bytes, b"a\xC3\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.truncate(1);
|
||||
assert_eq!(string.bytes, b"a");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.truncate(0);
|
||||
assert_eq!(string.bytes, b"");
|
||||
assert!(string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_truncate_around_non_bmp() {
|
||||
let mut string = Wtf8Buf::from_str("💩");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.truncate(4);
|
||||
assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");
|
||||
assert!(string.is_known_utf8);
|
||||
|
||||
string.truncate(0);
|
||||
assert_eq!(string.bytes, b"");
|
||||
assert!(string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -208,11 +310,37 @@ fn wtf8buf_truncate_fail_longer() {
|
||||
string.truncate(4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn wtf8buf_truncate_splitting_non_bmp3() {
|
||||
let mut string = Wtf8Buf::from_str("💩");
|
||||
assert!(string.is_known_utf8);
|
||||
string.truncate(3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn wtf8buf_truncate_splitting_non_bmp2() {
|
||||
let mut string = Wtf8Buf::from_str("💩");
|
||||
assert!(string.is_known_utf8);
|
||||
string.truncate(2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn wtf8buf_truncate_splitting_non_bmp1() {
|
||||
let mut string = Wtf8Buf::from_str("💩");
|
||||
assert!(string.is_known_utf8);
|
||||
string.truncate(1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8buf_into_string() {
|
||||
let mut string = Wtf8Buf::from_str("aé 💩");
|
||||
assert!(string.is_known_utf8);
|
||||
assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
|
||||
string.push(CodePoint::from_u32(0xD800).unwrap());
|
||||
assert!(!string.is_known_utf8);
|
||||
assert_eq!(string.clone().into_string(), Err(string));
|
||||
}
|
||||
|
||||
@ -229,15 +357,33 @@ fn wtf8buf_from_iterator() {
|
||||
fn f(values: &[u32]) -> Wtf8Buf {
|
||||
values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
|
||||
}
|
||||
assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert_eq!(
|
||||
f(&[0x61, 0xE9, 0x20, 0x1F4A9]),
|
||||
Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true }
|
||||
);
|
||||
|
||||
assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
|
||||
assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
|
||||
assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
|
||||
assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
|
||||
assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
|
||||
assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
|
||||
assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
|
||||
assert_eq!(
|
||||
f(&[0xD83D, 0x20, 0xDCA9]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
f(&[0xD800, 0xDBFF]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
f(&[0xD800, 0xE000]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
f(&[0xD7FF, 0xDC00]),
|
||||
Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
f(&[0x61, 0xDC00]),
|
||||
Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(f(&[0xDC00]), Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false });
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -251,15 +397,36 @@ fn wtf8buf_extend() {
|
||||
string
|
||||
}
|
||||
|
||||
assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
|
||||
assert_eq!(
|
||||
e(&[0x61, 0xE9], &[0x20, 0x1F4A9]),
|
||||
Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true }
|
||||
);
|
||||
|
||||
assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
|
||||
assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
|
||||
assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
|
||||
assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
|
||||
assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
|
||||
assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
|
||||
assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
|
||||
assert_eq!(
|
||||
e(&[0xD83D, 0x20], &[0xDCA9]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
e(&[0xD800], &[0xDBFF]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
e(&[0xD800], &[0xE000]),
|
||||
Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
e(&[0xD7FF], &[0xDC00]),
|
||||
Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
e(&[0x61], &[0xDC00]),
|
||||
Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
assert_eq!(
|
||||
e(&[], &[0xDC00]),
|
||||
Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false }
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -407,3 +574,93 @@ fn wtf8_encode_wide_size_hint() {
|
||||
assert_eq!((0, Some(0)), iter.size_hint());
|
||||
assert!(iter.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_clone_into() {
|
||||
let mut string = Wtf8Buf::new();
|
||||
Wtf8::from_str("green").clone_into(&mut string);
|
||||
assert_eq!(string.bytes, b"green");
|
||||
|
||||
let mut string = Wtf8Buf::from_str("green");
|
||||
Wtf8::from_str("").clone_into(&mut string);
|
||||
assert_eq!(string.bytes, b"");
|
||||
|
||||
let mut string = Wtf8Buf::from_str("red");
|
||||
Wtf8::from_str("green").clone_into(&mut string);
|
||||
assert_eq!(string.bytes, b"green");
|
||||
|
||||
let mut string = Wtf8Buf::from_str("green");
|
||||
Wtf8::from_str("red").clone_into(&mut string);
|
||||
assert_eq!(string.bytes, b"red");
|
||||
|
||||
let mut string = Wtf8Buf::from_str("green");
|
||||
assert!(string.is_known_utf8);
|
||||
unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").clone_into(&mut string) };
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_to_ascii_lowercase() {
|
||||
let lowercase = Wtf8::from_str("").to_ascii_lowercase();
|
||||
assert_eq!(lowercase.bytes, b"");
|
||||
|
||||
let lowercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_lowercase();
|
||||
assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87");
|
||||
|
||||
let lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_lowercase() };
|
||||
assert_eq!(lowercase.bytes, b"\xED\xA0\x80");
|
||||
assert!(!lowercase.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_to_ascii_uppercase() {
|
||||
let uppercase = Wtf8::from_str("").to_ascii_uppercase();
|
||||
assert_eq!(uppercase.bytes, b"");
|
||||
|
||||
let uppercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_uppercase();
|
||||
assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87");
|
||||
|
||||
let uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_uppercase() };
|
||||
assert_eq!(uppercase.bytes, b"\xED\xA0\x80");
|
||||
assert!(!uppercase.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_make_ascii_lowercase() {
|
||||
let mut lowercase = Wtf8Buf::from_str("");
|
||||
lowercase.make_ascii_lowercase();
|
||||
assert_eq!(lowercase.bytes, b"");
|
||||
|
||||
let mut lowercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇");
|
||||
lowercase.make_ascii_lowercase();
|
||||
assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87");
|
||||
|
||||
let mut lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() };
|
||||
lowercase.make_ascii_lowercase();
|
||||
assert_eq!(lowercase.bytes, b"\xED\xA0\x80");
|
||||
assert!(!lowercase.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_make_ascii_uppercase() {
|
||||
let mut uppercase = Wtf8Buf::from_str("");
|
||||
uppercase.make_ascii_uppercase();
|
||||
assert_eq!(uppercase.bytes, b"");
|
||||
|
||||
let mut uppercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇");
|
||||
uppercase.make_ascii_uppercase();
|
||||
assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87");
|
||||
|
||||
let mut uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() };
|
||||
uppercase.make_ascii_uppercase();
|
||||
assert_eq!(uppercase.bytes, b"\xED\xA0\x80");
|
||||
assert!(!uppercase.is_known_utf8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wtf8_to_owned() {
|
||||
let string = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() };
|
||||
assert_eq!(string.bytes, b"\xED\xA0\x80");
|
||||
assert!(!string.is_known_utf8);
|
||||
}
|
||||
|
@ -71,7 +71,7 @@ pub(crate) struct Context<'tcx> {
|
||||
}
|
||||
|
||||
// `Context` is cloned a lot, so we don't want the size to grow unexpectedly.
|
||||
#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
|
||||
#[cfg(all(not(windows), target_arch = "x86_64", target_pointer_width = "64"))]
|
||||
rustc_data_structures::static_assert_size!(Context<'_>, 128);
|
||||
|
||||
/// Shared mutable state used in [`Context`] and elsewhere.
|
||||
|
Loading…
Reference in New Issue
Block a user