Rollup merge of #137155 - thaliaarchi:wtf8-organize, r=ChrisDenton

Organize `OsString`/`OsStr` shims

Synchronize the `bytes.rs` and `wtf8.rs` shims for `OsString`/`OsStr` so they're easier to diff between each other. This is mostly ordering items the same between the two. I tried to minimize moves and went for the average locations between the files.

With them in the same order, it is clear that `FromInner<_>` is not implemented for `bytes::Buf` and `Clone::clone_from` is not implemented for `wtf8::Buf`, even though each is implemented for the other type. This PR adds both missing implementations.

I added #[inline] to all inherent methods of the `OsString`/`OsStr` shims, because it seemed that was already the rough pattern. `bytes.rs` has more inlining than `wtf8.rs`, so I added the corresponding ones to `wtf8.rs`. Then, the common missing ones have no discernible pattern to me. They're not divided by non-allocating/allocating. Perhaps the pattern is that UTF-8 validation isn't inlined? Since these types are merely the inner values in `OsStr`/`OsString`, I put inline on all methods and let those public types dictate inlining. I have not inspected codegen or run benchmarks.

Also, touch up some (private) documentation comments.

r? @ChrisDenton
This commit is contained in:
Matthias Krüger 2025-02-19 21:16:12 +01:00 committed by GitHub
commit 3964bb131b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 162 additions and 117 deletions

View File

@ -8,7 +8,7 @@ use crate::collections::TryReserveError;
use crate::fmt::Write;
use crate::rc::Rc;
use crate::sync::Arc;
use crate::sys_common::{AsInner, IntoInner};
use crate::sys_common::{AsInner, FromInner, IntoInner};
use crate::{fmt, mem, str};
#[cfg(test)]
@ -25,6 +25,37 @@ pub struct Slice {
pub inner: [u8],
}
impl IntoInner<Vec<u8>> for Buf {
fn into_inner(self) -> Vec<u8> {
self.inner
}
}
impl FromInner<Vec<u8>> for Buf {
fn from_inner(inner: Vec<u8>) -> Self {
Buf { inner }
}
}
impl AsInner<[u8]> for Buf {
#[inline]
fn as_inner(&self) -> &[u8] {
&self.inner
}
}
impl fmt::Debug for Buf {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_slice(), f)
}
}
impl fmt::Display for Buf {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_slice(), f)
}
}
impl fmt::Debug for Slice {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
@ -55,18 +86,6 @@ impl fmt::Display for Slice {
}
}
impl fmt::Debug for Buf {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_slice(), formatter)
}
}
impl fmt::Display for Buf {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_slice(), formatter)
}
}
impl Clone for Buf {
#[inline]
fn clone(&self) -> Self {
@ -79,19 +98,6 @@ impl Clone for Buf {
}
}
impl IntoInner<Vec<u8>> for Buf {
fn into_inner(self) -> Vec<u8> {
self.inner
}
}
impl AsInner<[u8]> for Buf {
#[inline]
fn as_inner(&self) -> &[u8] {
&self.inner
}
}
impl Buf {
#[inline]
pub fn into_encoded_bytes(self) -> Vec<u8> {
@ -103,6 +109,12 @@ impl Buf {
Self { inner: s }
}
#[inline]
pub fn into_string(self) -> Result<String, Buf> {
String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
}
#[inline]
pub fn from_string(s: String) -> Buf {
Buf { inner: s.into_bytes() }
}
@ -122,6 +134,11 @@ impl Buf {
self.inner.capacity()
}
#[inline]
pub fn push_slice(&mut self, s: &Slice) {
self.inner.extend_from_slice(&s.inner)
}
#[inline]
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
@ -157,7 +174,7 @@ impl Buf {
// SAFETY: Slice just wraps [u8],
// and &*self.inner is &[u8], therefore
// transmuting &[u8] to &Slice is safe.
unsafe { mem::transmute(&*self.inner) }
unsafe { mem::transmute(self.inner.as_slice()) }
}
#[inline]
@ -165,15 +182,7 @@ impl Buf {
// SAFETY: Slice just wraps [u8],
// and &mut *self.inner is &mut [u8], therefore
// transmuting &mut [u8] to &mut Slice is safe.
unsafe { mem::transmute(&mut *self.inner) }
}
pub fn into_string(self) -> Result<String, Buf> {
String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
}
pub fn push_slice(&mut self, s: &Slice) {
self.inner.extend_from_slice(&s.inner)
unsafe { mem::transmute(self.inner.as_mut_slice()) }
}
#[inline]
@ -278,18 +287,22 @@ impl Slice {
unsafe { Slice::from_encoded_bytes_unchecked(s.as_bytes()) }
}
#[inline]
pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> {
str::from_utf8(&self.inner)
}
#[inline]
pub fn to_string_lossy(&self) -> Cow<'_, str> {
String::from_utf8_lossy(&self.inner)
}
#[inline]
pub fn to_owned(&self) -> Buf {
Buf { inner: self.inner.to_vec() }
}
#[inline]
pub fn clone_into(&self, buf: &mut Buf) {
self.inner.clone_into(&mut buf.inner)
}
@ -300,6 +313,7 @@ impl Slice {
unsafe { mem::transmute(boxed) }
}
#[inline]
pub fn empty_box() -> Box<Slice> {
let boxed: Box<[u8]> = Default::default();
unsafe { mem::transmute(boxed) }

View File

@ -10,11 +10,16 @@ use crate::sys_common::wtf8::{Wtf8, Wtf8Buf, check_utf8_boundary};
use crate::sys_common::{AsInner, FromInner, IntoInner};
use crate::{fmt, mem};
#[derive(Clone, Hash)]
#[derive(Hash)]
pub struct Buf {
pub inner: Wtf8Buf,
}
#[repr(transparent)]
pub struct Slice {
pub inner: Wtf8,
}
impl IntoInner<Wtf8Buf> for Buf {
fn into_inner(self) -> Wtf8Buf {
self.inner
@ -35,31 +40,38 @@ impl AsInner<Wtf8> for Buf {
}
impl fmt::Debug for Buf {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_slice(), formatter)
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(self.as_slice(), f)
}
}
impl fmt::Display for Buf {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_slice(), formatter)
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(self.as_slice(), f)
}
}
#[repr(transparent)]
pub struct Slice {
pub inner: Wtf8,
}
impl fmt::Debug for Slice {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&self.inner, formatter)
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&self.inner, f)
}
}
impl fmt::Display for Slice {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.inner, formatter)
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.inner, f)
}
}
impl Clone for Buf {
#[inline]
fn clone(&self) -> Self {
Buf { inner: self.inner.clone() }
}
#[inline]
fn clone_from(&mut self, source: &Self) {
self.inner.clone_from(&source.inner)
}
}
@ -74,62 +86,57 @@ impl Buf {
unsafe { Self { inner: Wtf8Buf::from_bytes_unchecked(s) } }
}
pub fn with_capacity(capacity: usize) -> Buf {
Buf { inner: Wtf8Buf::with_capacity(capacity) }
}
pub fn clear(&mut self) {
self.inner.clear()
}
pub fn capacity(&self) -> usize {
self.inner.capacity()
}
pub fn from_string(s: String) -> Buf {
Buf { inner: Wtf8Buf::from_string(s) }
}
pub fn as_slice(&self) -> &Slice {
// SAFETY: Slice is just a wrapper for Wtf8,
// and self.inner.as_slice() returns &Wtf8.
// Therefore, transmuting &Wtf8 to &Slice is safe.
unsafe { mem::transmute(self.inner.as_slice()) }
}
pub fn as_mut_slice(&mut self) -> &mut Slice {
// SAFETY: Slice is just a wrapper for Wtf8,
// and self.inner.as_mut_slice() returns &mut Wtf8.
// Therefore, transmuting &mut Wtf8 to &mut Slice is safe.
// Additionally, care should be taken to ensure the slice
// is always valid Wtf8.
unsafe { mem::transmute(self.inner.as_mut_slice()) }
}
#[inline]
pub fn into_string(self) -> Result<String, Buf> {
self.inner.into_string().map_err(|buf| Buf { inner: buf })
}
#[inline]
pub fn from_string(s: String) -> Buf {
Buf { inner: Wtf8Buf::from_string(s) }
}
#[inline]
pub fn with_capacity(capacity: usize) -> Buf {
Buf { inner: Wtf8Buf::with_capacity(capacity) }
}
#[inline]
pub fn clear(&mut self) {
self.inner.clear()
}
#[inline]
pub fn capacity(&self) -> usize {
self.inner.capacity()
}
#[inline]
pub fn push_slice(&mut self, s: &Slice) {
self.inner.push_wtf8(&s.inner)
}
#[inline]
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
#[inline]
pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
self.inner.try_reserve(additional)
}
#[inline]
pub fn reserve_exact(&mut self, additional: usize) {
self.inner.reserve_exact(additional)
}
#[inline]
pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
self.inner.try_reserve_exact(additional)
}
#[inline]
pub fn shrink_to_fit(&mut self) {
self.inner.shrink_to_fit()
}
@ -139,6 +146,24 @@ impl Buf {
self.inner.shrink_to(min_capacity)
}
#[inline]
pub fn as_slice(&self) -> &Slice {
// SAFETY: Slice is just a wrapper for Wtf8,
// and self.inner.as_slice() returns &Wtf8.
// Therefore, transmuting &Wtf8 to &Slice is safe.
unsafe { mem::transmute(self.inner.as_slice()) }
}
#[inline]
pub fn as_mut_slice(&mut self) -> &mut Slice {
// SAFETY: Slice is just a wrapper for Wtf8,
// and self.inner.as_mut_slice() returns &mut Wtf8.
// Therefore, transmuting &mut Wtf8 to &mut Slice is safe.
// Additionally, care should be taken to ensure the slice
// is always valid Wtf8.
unsafe { mem::transmute(self.inner.as_mut_slice()) }
}
#[inline]
pub fn leak<'a>(self) -> &'a mut Slice {
unsafe { mem::transmute(self.inner.leak()) }
@ -194,6 +219,7 @@ impl Slice {
}
#[track_caller]
#[inline]
pub fn check_public_boundary(&self, index: usize) {
check_utf8_boundary(&self.inner, index);
}
@ -203,18 +229,22 @@ impl Slice {
unsafe { mem::transmute(Wtf8::from_str(s)) }
}
#[inline]
pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> {
self.inner.as_str()
}
#[inline]
pub fn to_string_lossy(&self) -> Cow<'_, str> {
self.inner.to_string_lossy()
}
#[inline]
pub fn to_owned(&self) -> Buf {
Buf { inner: self.inner.to_owned() }
}
#[inline]
pub fn clone_into(&self, buf: &mut Buf) {
self.inner.clone_into(&mut buf.inner)
}
@ -224,6 +254,7 @@ impl Slice {
unsafe { mem::transmute(self.inner.into_box()) }
}
#[inline]
pub fn empty_box() -> Box<Slice> {
unsafe { mem::transmute(Wtf8::empty_box()) }
}

View File

@ -156,9 +156,12 @@ impl ops::DerefMut for Wtf8Buf {
}
}
/// Format the string with double quotes,
/// and surrogates as `\u` followed by four hexadecimal digits.
/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
/// Formats the string in double quotes, with characters escaped according to
/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
/// where each `x` is a hexadecimal digit.
///
/// For example, the code points [U+0061, U+D800, U+000A] are formatted as
/// `"a\u{D800}\n"`.
impl fmt::Debug for Wtf8Buf {
#[inline]
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
@ -181,7 +184,7 @@ impl Wtf8Buf {
/// Creates a WTF-8 string from a WTF-8 byte vec.
///
/// Since the byte vec is not checked for valid WTF-8, this functions is
/// Since the byte vec is not checked for valid WTF-8, this function is
/// marked unsafe.
#[inline]
pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
@ -205,7 +208,7 @@ impl Wtf8Buf {
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
#[inline]
pub fn from_str(s: &str) -> Wtf8Buf {
Wtf8Buf { bytes: <[_]>::to_vec(s.as_bytes()), is_known_utf8: true }
Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: true }
}
pub fn clear(&mut self) {
@ -237,8 +240,9 @@ impl Wtf8Buf {
string
}
/// Copied from String::push
/// Appends the given code point to the end of this string.
/// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
/// Copied from `String::push`.
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
let mut bytes = [0; MAX_LEN_UTF8];
let bytes = encode_utf8_raw(code_point.value, &mut bytes);
@ -264,16 +268,16 @@ impl Wtf8Buf {
///
/// # Panics
///
/// Panics if the new capacity overflows `usize`.
/// Panics if the new capacity exceeds `isize::MAX` bytes.
#[inline]
pub fn reserve(&mut self, additional: usize) {
self.bytes.reserve(additional)
}
/// Tries to reserve capacity for at least `additional` more length units
/// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid
/// frequent reallocations. After calling `try_reserve`, capacity will be
/// greater than or equal to `self.len() + additional`. Does nothing if
/// Tries to reserve capacity for at least `additional` more bytes to be
/// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
/// avoid frequent reallocations. After calling `try_reserve`, capacity will
/// be greater than or equal to `self.len() + additional`. Does nothing if
/// capacity is already sufficient. This method preserves the contents even
/// if an error occurs.
///
@ -291,8 +295,8 @@ impl Wtf8Buf {
self.bytes.reserve_exact(additional)
}
/// Tries to reserve the minimum capacity for exactly `additional`
/// length units in the given `Wtf8Buf`. After calling
/// Tries to reserve the minimum capacity for exactly `additional` more
/// bytes to be inserted in the given `Wtf8Buf`. After calling
/// `try_reserve_exact`, capacity will be greater than or equal to
/// `self.len() + additional` if it returns `Ok(())`.
/// Does nothing if the capacity is already sufficient.
@ -440,22 +444,17 @@ impl Wtf8Buf {
///
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “<>”)
pub fn into_string_lossy(mut self) -> String {
// Fast path: If we already have UTF-8, we can return it immediately.
if self.is_known_utf8 {
return unsafe { String::from_utf8_unchecked(self.bytes) };
}
let mut pos = 0;
loop {
match self.next_surrogate(pos) {
Some((surrogate_pos, _)) => {
pos = surrogate_pos + 3;
self.bytes[surrogate_pos..pos]
.copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
}
None => return unsafe { String::from_utf8_unchecked(self.bytes) },
if !self.is_known_utf8 {
let mut pos = 0;
while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
pos = surrogate_pos + 3;
// Surrogates and the replacement character are all 3 bytes, so
// they can be substituted in-place.
self.bytes[surrogate_pos..pos]
.copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
}
}
unsafe { String::from_utf8_unchecked(self.bytes) }
}
/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
@ -535,9 +534,9 @@ impl AsInner<[u8]> for Wtf8 {
}
}
/// Format the slice with double quotes,
/// and surrogates as `\u` followed by four hexadecimal digits.
/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
/// Formats the string in double quotes, with characters escaped according to
/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
/// where each `x` is a hexadecimal digit.
impl fmt::Debug for Wtf8 {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
@ -562,6 +561,8 @@ impl fmt::Debug for Wtf8 {
}
}
/// Formats the string with unpaired surrogates substituted with the replacement
/// character, U+FFFD.
impl fmt::Display for Wtf8 {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
let wtf8_bytes = &self.bytes;
@ -672,9 +673,8 @@ impl Wtf8 {
///
/// This only copies the data if necessary (if it contains any surrogate).
pub fn to_string_lossy(&self) -> Cow<'_, str> {
let surrogate_pos = match self.next_surrogate(0) {
None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
Some((pos, _)) => pos,
let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
};
let wtf8_bytes = &self.bytes;
let mut utf8_bytes = Vec::with_capacity(self.len());
@ -964,7 +964,7 @@ pub struct Wtf8CodePoints<'a> {
bytes: slice::Iter<'a, u8>,
}
impl<'a> Iterator for Wtf8CodePoints<'a> {
impl Iterator for Wtf8CodePoints<'_> {
type Item = CodePoint;
#[inline]
@ -990,7 +990,7 @@ pub struct EncodeWide<'a> {
// Copied from libunicode/u_str.rs
#[stable(feature = "rust1", since = "1.0.0")]
impl<'a> Iterator for EncodeWide<'a> {
impl Iterator for EncodeWide<'_> {
type Item = u16;
#[inline]