From 729851ef9e746473a2e7cbe20564383433b5e376 Mon Sep 17 00:00:00 2001 From: Jan Verbeek Date: Wed, 29 Nov 2023 19:05:01 +0100 Subject: [PATCH] Add substring API for `OsStr` --- library/std/src/ffi/os_str.rs | 82 ++++++++++++++++++++++++++++- library/std/src/ffi/os_str/tests.rs | 50 ++++++++++++++++++ library/std/src/lib.rs | 1 + 3 files changed, 131 insertions(+), 2 deletions(-) diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index fa9d48771b6..81973182148 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -6,9 +6,10 @@ use crate::cmp; use crate::collections::TryReserveError; use crate::fmt; use crate::hash::{Hash, Hasher}; -use crate::ops; +use crate::ops::{self, Range}; use crate::rc::Rc; -use crate::str::FromStr; +use crate::slice; +use crate::str::{from_utf8 as str_from_utf8, FromStr}; use crate::sync::Arc; use crate::sys::os_str::{Buf, Slice}; @@ -963,6 +964,83 @@ impl OsStr { self.inner.as_encoded_bytes() } + /// Takes a substring based on a range that corresponds to the return value of + /// [`OsStr::as_encoded_bytes`]. + /// + /// The range's start and end must lie on valid `OsStr` boundaries. + /// A valid `OsStr` boundary is one of: + /// - The start of the string + /// - The end of the string + /// - Immediately before a valid non-empty UTF-8 substring + /// - Immediately after a valid non-empty UTF-8 substring + /// + /// # Panics + /// + /// Panics if `range` does not lie on valid `OsStr` boundaries or if it + /// exceeds the end of the string. + /// + /// # Example + /// + /// ``` + /// #![feature(os_str_slice)] + /// + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo=bar"); + /// let bytes = os_str.as_encoded_bytes(); + /// if let Some(index) = bytes.iter().position(|b| *b == b'=') { + /// let key = os_str.slice_encoded_bytes(..index); + /// let value = os_str.slice_encoded_bytes(index + 1..); + /// assert_eq!(key, "foo"); + /// assert_eq!(value, "bar"); + /// } + /// ``` + #[unstable(feature = "os_str_slice", issue = "118485")] + pub fn slice_encoded_bytes>(&self, range: R) -> &Self { + #[track_caller] + fn check_valid_boundary(bytes: &[u8], index: usize) { + if index == 0 || index == bytes.len() { + return; + } + + // Fast path + if bytes[index - 1].is_ascii() || bytes[index].is_ascii() { + return; + } + + let (before, after) = bytes.split_at(index); + + // UTF-8 takes at most 4 bytes per codepoint, so we don't + // need to check more than that. + let after = after.get(..4).unwrap_or(after); + match str_from_utf8(after) { + Ok(_) => return, + Err(err) if err.valid_up_to() != 0 => return, + Err(_) => (), + } + + for len in 2..=4.min(index) { + let before = &before[index - len..]; + if str_from_utf8(before).is_ok() { + return; + } + } + + panic!("byte index {index} is not an OsStr boundary"); + } + + let encoded_bytes = self.as_encoded_bytes(); + let Range { start, end } = slice::range(range, ..encoded_bytes.len()); + check_valid_boundary(encoded_bytes, start); + check_valid_boundary(encoded_bytes, end); + + // SAFETY: `slice::range` ensures that `start` and `end` are valid + let slice = unsafe { encoded_bytes.get_unchecked(start..end) }; + + // SAFETY: `slice` comes from `self` and we validated the boundaries + unsafe { Self::from_encoded_bytes_unchecked(slice) } + } + /// Converts this string to its ASCII lower case equivalent in-place. /// /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', diff --git a/library/std/src/ffi/os_str/tests.rs b/library/std/src/ffi/os_str/tests.rs index d7926749aae..2765398d3e6 100644 --- a/library/std/src/ffi/os_str/tests.rs +++ b/library/std/src/ffi/os_str/tests.rs @@ -177,3 +177,53 @@ fn into_rc() { assert_eq!(&*rc2, os_str); assert_eq!(&*arc2, os_str); } + +#[test] +fn slice_encoded_bytes() { + let os_str = OsStr::new("123ΞΈαƒ’πŸ¦€"); + // ASCII + let digits = os_str.slice_encoded_bytes(..3); + assert_eq!(digits, "123"); + let three = os_str.slice_encoded_bytes(2..3); + assert_eq!(three, "3"); + // 2-byte UTF-8 + let theta = os_str.slice_encoded_bytes(3..5); + assert_eq!(theta, "ΞΈ"); + // 3-byte UTF-8 + let gani = os_str.slice_encoded_bytes(5..8); + assert_eq!(gani, "αƒ’"); + // 4-byte UTF-8 + let crab = os_str.slice_encoded_bytes(8..); + assert_eq!(crab, "πŸ¦€"); +} + +#[test] +#[should_panic(expected = "byte index 2 is not an OsStr boundary")] +fn slice_mid_char() { + let crab = OsStr::new("πŸ¦€"); + let _ = crab.slice_encoded_bytes(..2); +} + +#[cfg(windows)] +#[test] +#[should_panic(expected = "byte index 3 is not an OsStr boundary")] +fn slice_between_surrogates() { + use crate::os::windows::ffi::OsStringExt; + + let os_string = OsString::from_wide(&[0xD800, 0xD800]); + assert_eq!(os_string.as_encoded_bytes(), &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80]); + let _ = os_string.slice_encoded_bytes(..3); +} + +#[cfg(windows)] +#[test] +fn slice_surrogate_edge() { + use crate::os::windows::ffi::OsStringExt; + + let os_string = OsString::from_wide(&[0xD800]); + let mut with_crab = os_string.clone(); + with_crab.push("πŸ¦€"); + + assert_eq!(with_crab.slice_encoded_bytes(..3), os_string); + assert_eq!(with_crab.slice_encoded_bytes(3..), "πŸ¦€"); +} diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 8dc5b07ce10..d4d21b2e981 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -341,6 +341,7 @@ #![feature(round_ties_even)] #![feature(slice_internals)] #![feature(slice_ptr_get)] +#![feature(slice_range)] #![feature(std_internals)] #![feature(str_internals)] #![feature(strict_provenance)]