Add {floor,ceil}_char_boundary methods to str

This commit is contained in:
ltdk 2021-06-20 16:24:10 -04:00
parent c5e414843e
commit edd318c313
5 changed files with 176 additions and 23 deletions

View File

@ -29,6 +29,7 @@
#![feature(binary_heap_as_slice)]
#![feature(inplace_iteration)]
#![feature(iter_advance_by)]
#![feature(round_char_boundary)]
#![feature(slice_group_by)]
#![feature(slice_partition_dedup)]
#![feature(string_remove_matches)]

View File

@ -2272,3 +2272,95 @@ fn utf8_char_counts() {
}
}
}
#[test]
fn floor_char_boundary() {
    /// Asserts that flooring every index yielded by `indices` inside `text`
    /// produces `expected`.
    fn check_many<I>(text: &str, indices: I, expected: usize)
    where
        I: IntoIterator<Item = usize>,
    {
        indices.into_iter().for_each(|idx| {
            assert_eq!(
                text.floor_char_boundary(idx),
                expected,
                "{:?}.floor_char_boundary({:?}) != {:?}",
                text,
                idx,
                expected
            );
        });
    }

    // Empty string: every index, however large, floors to 0.
    let empty = "";
    check_many(empty, [0, 1, isize::MAX as usize, usize::MAX], 0);

    // Single ASCII char: indices at or past the end clamp to the length.
    let single = "x";
    check_many(single, [0], 0);
    check_many(single, [1, isize::MAX as usize, usize::MAX], 1);

    // Chars encoded in one byte each.
    let one_byte = "jp";
    check_many(one_byte, [0], 0);
    check_many(one_byte, [1], 1);
    check_many(one_byte, 2..4, 2);

    // Chars encoded in two bytes each.
    let two_byte = "ĵƥ";
    check_many(two_byte, 0..2, 0);
    check_many(two_byte, 2..4, 2);
    check_many(two_byte, 4..6, 4);

    // Chars encoded in three bytes each.
    let three_byte = "日本";
    check_many(three_byte, 0..3, 0);
    check_many(three_byte, 3..6, 3);
    check_many(three_byte, 6..8, 6);

    // Chars encoded in four bytes each.
    let four_byte = "🇯🇵";
    check_many(four_byte, 0..4, 0);
    check_many(four_byte, 4..8, 4);
    check_many(four_byte, 8..10, 8);
}
#[test]
fn ceil_char_boundary() {
    /// Asserts that rounding every index yielded by `indices` up inside `text`
    /// produces `expected`.
    fn check_many<I>(text: &str, indices: I, expected: usize)
    where
        I: IntoIterator<Item = usize>,
    {
        indices.into_iter().for_each(|idx| {
            assert_eq!(
                text.ceil_char_boundary(idx),
                expected,
                "{:?}.ceil_char_boundary({:?}) != {:?}",
                text,
                idx,
                expected
            );
        });
    }

    // Empty string: the only in-range index is 0.
    let empty = "";
    check_many(empty, [0], 0);

    // Single ASCII char: both of its boundaries map to themselves.
    let single = "x";
    check_many(single, [0], 0);
    check_many(single, [1], 1);

    // Chars encoded in one byte each: every index is already a boundary.
    let one_byte = "jp";
    check_many(one_byte, [0], 0);
    check_many(one_byte, [1], 1);
    check_many(one_byte, [2], 2);

    // Chars encoded in two bytes each.
    let two_byte = "ĵƥ";
    check_many(two_byte, 0..=0, 0);
    check_many(two_byte, 1..=2, 2);
    check_many(two_byte, 3..=4, 4);

    // Chars encoded in three bytes each.
    let three_byte = "日本";
    check_many(three_byte, 0..=0, 0);
    check_many(three_byte, 1..=3, 3);
    check_many(three_byte, 4..=6, 6);

    // Chars encoded in four bytes each.
    let four_byte = "🇯🇵";
    check_many(four_byte, 0..=0, 0);
    check_many(four_byte, 1..=4, 4);
    check_many(four_byte, 5..=8, 8);
}
#[test]
#[should_panic]
fn ceil_char_boundary_above_len_panic() {
    // "x" is one byte long, so index 2 is past the end; `ceil_char_boundary`
    // is documented to panic when `index > self.len()`.
    let one_byte = "x";
    let _ = one_byte.ceil_char_boundary(2);
}

View File

@ -809,6 +809,11 @@ impl u8 {
pub fn escape_ascii(&self) -> ascii::EscapeDefault {
// Thin wrapper: dereference to pass the byte by value and return whatever
// `ascii::escape_default` produces for it.
ascii::escape_default(*self)
}
/// Returns `true` if this byte is not a UTF-8 continuation byte
/// (`0b10xx_xxxx`), i.e. it is either an ASCII byte or the leading byte of a
/// multi-byte sequence.
///
/// This is the per-byte test behind `str::is_char_boundary`,
/// `str::floor_char_boundary` and `str::ceil_char_boundary`.
// Those callers are `#[inline]`; mark this tiny helper `#[inline]` as well so
// the check can be folded into them instead of remaining an out-of-line call.
#[inline]
pub(crate) fn is_utf8_char_boundary(self) -> bool {
    // This is bit magic equivalent to: b < 128 || b >= 192
    //
    // Reinterpreted as `i8`, the continuation bytes 0x80..=0xBF become
    // -128..=-65, which are exactly the values below -0x40 (-64).
    (self as i8) >= -0x40
}
}
#[lang = "u16"]

View File

@ -76,15 +76,14 @@ use iter::MatchIndicesInternal;
use iter::SplitInternal;
use iter::{MatchesInternal, SplitNInternal};
use validations::truncate_to_char_boundary;
#[inline(never)]
#[cold]
#[track_caller]
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
const MAX_DISPLAY_LENGTH: usize = 256;
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
let ellipsis = if truncated { "[...]" } else { "" };
let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH);
let s_trunc = &s[..trunc_len];
let ellipsis = if trunc_len < s.len() { "[...]" } else { "" };
// 1. out of bounds
if begin > s.len() || end > s.len() {
@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
// 3. character boundary
let index = if !s.is_char_boundary(begin) { begin } else { end };
// find the character
let mut char_start = index;
while !s.is_char_boundary(char_start) {
char_start -= 1;
}
let char_start = s.floor_char_boundary(index);
// `char_start` must be less than len and a char boundary
let ch = s[char_start..].chars().next().unwrap();
let char_range = char_start..char_start + ch.len_utf8();
@ -215,8 +211,80 @@ impl str {
// code on higher opt-levels. See PR #84751 for more details.
None => index == self.len(),
// This is bit magic equivalent to: b < 128 || b >= 192
Some(&b) => (b as i8) >= -0x40,
Some(&b) => b.is_utf8_char_boundary(),
}
}
/// Returns the largest `x` that is less than or equal to `index` and for which
/// `is_char_boundary(x)` is `true`.
///
/// If `index` is at or past the end of the string, the length of the string is
/// returned.
///
/// Use this to cut a string down to at most a given number of bytes while keeping
/// it valid UTF-8. The rounding works on whole characters only, so a grapheme made
/// of several characters can still be split visually. For example, the emoji
/// 🧑‍🔬 (scientist) could be split so that the string only includes 🧑 (person)
/// instead.
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.floor_char_boundary(13);
/// assert_eq!(closest, 10);
/// assert_eq!(&s[..closest], "❤️🧡");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn floor_char_boundary(&self, index: usize) -> usize {
    let len = self.len();
    if index >= len {
        return len;
    }

    // A UTF-8 encoded character is at most four bytes long, so only the byte at
    // `index` and up to three bytes before it need to be inspected.
    let window_start = index.saturating_sub(3);
    let window = &self.as_bytes()[window_start..=index];
    let offset = window.iter().rposition(|byte| byte.is_utf8_char_boundary());

    // SAFETY: `self` is valid UTF-8, so a character has at most three
    // continuation bytes. The window therefore either spans four bytes, one of
    // which must be a non-continuation byte, or begins at byte 0, which always
    // starts a character. Either way `rposition` found a match.
    window_start + unsafe { offset.unwrap_unchecked() }
}
/// Returns the smallest `x` that is greater than or equal to `index` and for which
/// `is_char_boundary(x)` is `true`.
///
/// This is the counterpart of [`floor_char_boundary`], rounding up instead of
/// down. See that method for more details.
///
/// [`floor_char_boundary`]: str::floor_char_boundary
///
/// # Panics
///
/// Panics if `index > self.len()`.
///
/// # Examples
///
/// ```
/// #![feature(round_char_boundary)]
/// let s = "❤️🧡💛💚💙💜";
/// assert_eq!(s.len(), 26);
/// assert!(!s.is_char_boundary(13));
///
/// let closest = s.ceil_char_boundary(13);
/// assert_eq!(closest, 14);
/// assert_eq!(&s[..closest], "❤️🧡💛");
/// ```
#[unstable(feature = "round_char_boundary", issue = "93743")]
#[inline]
pub fn ceil_char_boundary(&self, index: usize) -> usize {
    let len = self.len();
    if index > len {
        // Diverges: reports the out-of-range index through the shared slicing
        // failure path.
        slice_error_fail(self, index, index);
    }

    // Inspect at most four bytes starting at `index`, clamped to the end of the
    // string.
    let window_end = Ord::min(index + 4, len);
    let first_boundary = self.as_bytes()[index..window_end]
        .iter()
        .position(|byte| byte.is_utf8_char_boundary());

    match first_boundary {
        Some(offset) => offset + index,
        // No boundary byte inside the window: fall back to the window's end.
        None => window_end,
    }
}

View File

@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize {
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
// Shortens `s` to at most `max` bytes without cutting through a character.
//
// Returns `(false, s)` when `s` already fits in `max` bytes; otherwise returns
// `(true, prefix)` where `prefix` ends at the last char boundary at or before `max`.
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
    let fits = max >= s.len();
    if fits {
        return (false, s);
    }

    // Step backwards until `max` lands on a char boundary. Index 0 is always a
    // boundary, so the subtraction cannot underflow.
    loop {
        if s.is_char_boundary(max) {
            break (true, &s[..max]);
        }
        max -= 1;
    }
}