Rollup merge of #119808 - GnomedDev:encode-charsearcher-size-in-type, r=Mark-Simulacrum

Store core::str::CharSearcher::utf8_size as u8

The value is already guaranteed to fit in a `u8` by the `safety invariant: utf8_size must be less than 5`, so storing it as a `u8` helps LLVM optimize and may improve copies, because the struct then has padding instead of unused bytes.
This commit is contained in:
Matthias Krüger 2024-02-19 13:04:32 +01:00 committed by GitHub
commit c5da0382c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -40,6 +40,7 @@
use crate::cmp;
use crate::cmp::Ordering;
use crate::convert::TryInto as _;
use crate::fmt;
use crate::slice::memchr;
@ -370,11 +371,17 @@ pub struct CharSearcher<'a> {
// safety invariant: `utf8_size` must be less than 5
/// The number of bytes `needle` takes up when encoded in utf8.
utf8_size: usize,
utf8_size: u8,
/// A utf8 encoded copy of the `needle`
utf8_encoded: [u8; 4],
}
impl CharSearcher<'_> {
    /// Returns the stored `utf8_size` field converted to a `usize`, so that
    /// callers can use it directly for indexing and offset arithmetic.
    fn utf8_size(&self) -> usize {
        usize::from(self.utf8_size)
    }
}
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
#[inline]
fn haystack(&self) -> &'a str {
@ -414,7 +421,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
if let Some(index) = memchr::memchr(last_byte, bytes) {
// The new finger is the index of the byte we found,
// plus one, since we memchr'd for the last byte of the character.
@ -434,10 +441,10 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
// find something. When we find something the `finger` will be set
// to a UTF8 boundary.
self.finger += index + 1;
if self.finger >= self.utf8_size {
let found_char = self.finger - self.utf8_size;
if self.finger >= self.utf8_size() {
let found_char = self.finger - self.utf8_size();
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if slice == &self.utf8_encoded[0..self.utf8_size()] {
return Some((found_char, self.finger));
}
}
@ -482,7 +489,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
let bytes = haystack.get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
if let Some(index) = memchr::memrchr(last_byte, bytes) {
// we searched a slice that was offset by self.finger,
// add self.finger to recoup the original index
@ -493,14 +500,14 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
// char in the paradigm of reverse iteration). For
// multibyte chars we need to skip down by the number of more
// bytes they have than ASCII
let shift = self.utf8_size - 1;
let shift = self.utf8_size() - 1;
if index >= shift {
let found_char = index - shift;
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size())) {
if slice == &self.utf8_encoded[0..self.utf8_size()] {
// move finger to before the character found (i.e., at its start index)
self.finger_back = found_char;
return Some((self.finger_back, self.finger_back + self.utf8_size));
return Some((self.finger_back, self.finger_back + self.utf8_size()));
}
}
}
@ -542,7 +549,12 @@ impl<'a> Pattern<'a> for char {
#[inline]
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
let mut utf8_encoded = [0; 4];
let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
let utf8_size = self
.encode_utf8(&mut utf8_encoded)
.len()
.try_into()
.expect("char len should be less than 255");
CharSearcher {
haystack,
finger: 0,