mirror of
https://github.com/rust-lang/rust.git
synced 2024-10-31 22:41:50 +00:00
Add {floor,ceil}_char_boundary methods to str
This commit is contained in:
parent
c5e414843e
commit
edd318c313
@ -29,6 +29,7 @@
|
||||
#![feature(binary_heap_as_slice)]
|
||||
#![feature(inplace_iteration)]
|
||||
#![feature(iter_advance_by)]
|
||||
#![feature(round_char_boundary)]
|
||||
#![feature(slice_group_by)]
|
||||
#![feature(slice_partition_dedup)]
|
||||
#![feature(string_remove_matches)]
|
||||
|
@ -2272,3 +2272,95 @@ fn utf8_char_counts() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn floor_char_boundary() {
|
||||
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
|
||||
for idx in arg {
|
||||
assert_eq!(
|
||||
s.floor_char_boundary(idx),
|
||||
ret,
|
||||
"{:?}.floor_char_boundary({:?}) != {:?}",
|
||||
s,
|
||||
idx,
|
||||
ret
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// edge case
|
||||
check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0);
|
||||
|
||||
// basic check
|
||||
check_many("x", [0], 0);
|
||||
check_many("x", [1, isize::MAX as usize, usize::MAX], 1);
|
||||
|
||||
// 1-byte chars
|
||||
check_many("jp", [0], 0);
|
||||
check_many("jp", [1], 1);
|
||||
check_many("jp", 2..4, 2);
|
||||
|
||||
// 2-byte chars
|
||||
check_many("ĵƥ", 0..2, 0);
|
||||
check_many("ĵƥ", 2..4, 2);
|
||||
check_many("ĵƥ", 4..6, 4);
|
||||
|
||||
// 3-byte chars
|
||||
check_many("日本", 0..3, 0);
|
||||
check_many("日本", 3..6, 3);
|
||||
check_many("日本", 6..8, 6);
|
||||
|
||||
// 4-byte chars
|
||||
check_many("🇯🇵", 0..4, 0);
|
||||
check_many("🇯🇵", 4..8, 4);
|
||||
check_many("🇯🇵", 8..10, 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ceil_char_boundary() {
|
||||
fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) {
|
||||
for idx in arg {
|
||||
assert_eq!(
|
||||
s.ceil_char_boundary(idx),
|
||||
ret,
|
||||
"{:?}.ceil_char_boundary({:?}) != {:?}",
|
||||
s,
|
||||
idx,
|
||||
ret
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// edge case
|
||||
check_many("", [0], 0);
|
||||
|
||||
// basic check
|
||||
check_many("x", [0], 0);
|
||||
check_many("x", [1], 1);
|
||||
|
||||
// 1-byte chars
|
||||
check_many("jp", [0], 0);
|
||||
check_many("jp", [1], 1);
|
||||
check_many("jp", [2], 2);
|
||||
|
||||
// 2-byte chars
|
||||
check_many("ĵƥ", 0..=0, 0);
|
||||
check_many("ĵƥ", 1..=2, 2);
|
||||
check_many("ĵƥ", 3..=4, 4);
|
||||
|
||||
// 3-byte chars
|
||||
check_many("日本", 0..=0, 0);
|
||||
check_many("日本", 1..=3, 3);
|
||||
check_many("日本", 4..=6, 6);
|
||||
|
||||
// 4-byte chars
|
||||
check_many("🇯🇵", 0..=0, 0);
|
||||
check_many("🇯🇵", 1..=4, 4);
|
||||
check_many("🇯🇵", 5..=8, 8);
|
||||
}
|
||||
|
||||
#[test]
#[should_panic]
fn ceil_char_boundary_above_len_panic() {
    // Index 2 lies past the end of a one-byte string; `ceil_char_boundary` is
    // documented to panic whenever `index > len`.
    let text = "x";
    let _ = text.ceil_char_boundary(2);
}
|
||||
|
@ -809,6 +809,11 @@ impl u8 {
|
||||
pub fn escape_ascii(&self) -> ascii::EscapeDefault {
|
||||
ascii::escape_default(*self)
|
||||
}
|
||||
|
||||
pub(crate) fn is_utf8_char_boundary(self) -> bool {
|
||||
// This is bit magic equivalent to: b < 128 || b >= 192
|
||||
(self as i8) >= -0x40
|
||||
}
|
||||
}
|
||||
|
||||
#[lang = "u16"]
|
||||
|
@ -76,15 +76,14 @@ use iter::MatchIndicesInternal;
|
||||
use iter::SplitInternal;
|
||||
use iter::{MatchesInternal, SplitNInternal};
|
||||
|
||||
use validations::truncate_to_char_boundary;
|
||||
|
||||
#[inline(never)]
|
||||
#[cold]
|
||||
#[track_caller]
|
||||
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
|
||||
const MAX_DISPLAY_LENGTH: usize = 256;
|
||||
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
|
||||
let ellipsis = if truncated { "[...]" } else { "" };
|
||||
let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
|
||||
let s_trunc = &s[..trunc_len];
|
||||
let ellipsis = if trunc_len < s.len() { "[...]" } else { "" };
|
||||
|
||||
// 1. out of bounds
|
||||
if begin > s.len() || end > s.len() {
|
||||
@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
|
||||
// 3. character boundary
|
||||
let index = if !s.is_char_boundary(begin) { begin } else { end };
|
||||
// find the character
|
||||
let mut char_start = index;
|
||||
while !s.is_char_boundary(char_start) {
|
||||
char_start -= 1;
|
||||
}
|
||||
let char_start = s.floor_char_boundary(index);
|
||||
// `char_start` must be less than len and a char boundary
|
||||
let ch = s[char_start..].chars().next().unwrap();
|
||||
let char_range = char_start..char_start + ch.len_utf8();
|
||||
@ -215,8 +211,80 @@ impl str {
|
||||
// code on higher opt-levels. See PR #84751 for more details.
|
||||
None => index == self.len(),
|
||||
|
||||
// This is bit magic equivalent to: b < 128 || b >= 192
|
||||
Some(&b) => (b as i8) >= -0x40,
|
||||
Some(&b) => b.is_utf8_char_boundary(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`.
|
||||
///
|
||||
/// This method can help you truncate a string so that it's still valid UTF-8, but doesn't
|
||||
/// exceed a given number of bytes. Note that this is done purely at the character level
|
||||
/// and can still visually split graphemes, even though the underlying characters aren't
|
||||
/// split. For example, the emoji 🧑🔬 (scientist) could be split so that the string only
|
||||
/// includes 🧑 (person) instead.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(round_char_boundary)]
|
||||
/// let s = "❤️🧡💛💚💙💜";
|
||||
/// assert_eq!(s.len(), 26);
|
||||
/// assert!(!s.is_char_boundary(13));
|
||||
///
|
||||
/// let closest = s.floor_char_boundary(13);
|
||||
/// assert_eq!(closest, 10);
|
||||
/// assert_eq!(&s[..closest], "❤️🧡");
|
||||
/// ```
|
||||
#[unstable(feature = "round_char_boundary", issue = "93743")]
|
||||
#[inline]
|
||||
pub fn floor_char_boundary(&self, index: usize) -> usize {
|
||||
if index >= self.len() {
|
||||
self.len()
|
||||
} else {
|
||||
let lower_bound = index.saturating_sub(3);
|
||||
let new_index = self.as_bytes()[lower_bound..=index]
|
||||
.iter()
|
||||
.rposition(|b| b.is_utf8_char_boundary());
|
||||
|
||||
// SAFETY: we know that the character boundary will be within four bytes
|
||||
unsafe { lower_bound + new_index.unwrap_unchecked() }
|
||||
}
|
||||
}
|
||||
|
||||
/// Finds the closest `x` not below `index` where `is_char_boundary(x)` is `true`.
|
||||
///
|
||||
/// This method is the natural complement to [`floor_char_boundary`]. See that method
|
||||
/// for more details.
|
||||
///
|
||||
/// [`floor_char_boundary`]: str::floor_char_boundary
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if `index > self.len()`.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(round_char_boundary)]
|
||||
/// let s = "❤️🧡💛💚💙💜";
|
||||
/// assert_eq!(s.len(), 26);
|
||||
/// assert!(!s.is_char_boundary(13));
|
||||
///
|
||||
/// let closest = s.ceil_char_boundary(13);
|
||||
/// assert_eq!(closest, 14);
|
||||
/// assert_eq!(&s[..closest], "❤️🧡💛");
|
||||
/// ```
|
||||
#[unstable(feature = "round_char_boundary", issue = "93743")]
|
||||
#[inline]
|
||||
pub fn ceil_char_boundary(&self, index: usize) -> usize {
|
||||
if index > self.len() {
|
||||
slice_error_fail(self, index, index)
|
||||
} else {
|
||||
let upper_bound = Ord::min(index + 4, self.len());
|
||||
self.as_bytes()[index..upper_bound]
|
||||
.iter()
|
||||
.position(|b| b.is_utf8_char_boundary())
|
||||
.map_or(upper_bound, |pos| pos + index)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize {
|
||||
|
||||
/// Mask of the value bits of a continuation byte.
|
||||
const CONT_MASK: u8 = 0b0011_1111;
|
||||
|
||||
// truncate `&str` to length at most equal to `max`
|
||||
// return `true` if it were truncated, and the new str.
|
||||
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
|
||||
if max >= s.len() {
|
||||
(false, s)
|
||||
} else {
|
||||
while !s.is_char_boundary(max) {
|
||||
max -= 1;
|
||||
}
|
||||
(true, &s[..max])
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user