Rollup merge of #119808 - GnomedDev:encode-charsearcher-size-in-type, r=Mark-Simulacrum

Store core::str::CharSearcher::utf8_size as u8. The value is already relied on to fit in a u8 because of the `safety invariant: utf8_size must be less than 5`, so encoding that in the type helps LLVM optimize and may improve copies, since the freed space becomes padding instead of unused bytes.
2024-11-02 07:22:42 +00:00 · 2024-02-19 13:04:32 +01:00 · 2024-02-19 13:04:32 +01:00 · c5da0382c8
commit c5da0382c8
parent eb1f279477 601f2d192e
1 changed files with 23 additions and 11 deletions
--- a/library/core/src/str/pattern.rs
+++ b/library/core/src/str/pattern.rs
@ -40,6 +40,7 @@

 use crate::cmp;
 use crate::cmp::Ordering;
+use crate::convert::TryInto as _;
 use crate::fmt;
 use crate::slice::memchr;

@ -370,11 +371,17 @@ pub struct CharSearcher<'a> {

    // safety invariant: `utf8_size` must be less than 5
    /// The number of bytes `needle` takes up when encoded in utf8.
-    utf8_size: usize,
+    utf8_size: u8,
    /// A utf8 encoded copy of the `needle`
    utf8_encoded: [u8; 4],
 }

+impl CharSearcher<'_> {
+    fn utf8_size(&self) -> usize {
+        self.utf8_size.into()
+    }
+}
+
 unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
    #[inline]
    fn haystack(&self) -> &'a str {
@ -414,7 +421,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
            let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
            // the last byte of the utf8 encoded needle
            // SAFETY: we have an invariant that `utf8_size < 5`
-            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
+            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
            if let Some(index) = memchr::memchr(last_byte, bytes) {
                // The new finger is the index of the byte we found,
                // plus one, since we memchr'd for the last byte of the character.
@ -434,10 +441,10 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
                // find something. When we find something the `finger` will be set
                // to a UTF8 boundary.
                self.finger += index + 1;
-                if self.finger >= self.utf8_size {
-                    let found_char = self.finger - self.utf8_size;
+                if self.finger >= self.utf8_size() {
+                    let found_char = self.finger - self.utf8_size();
                    if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
-                        if slice == &self.utf8_encoded[0..self.utf8_size] {
+                        if slice == &self.utf8_encoded[0..self.utf8_size()] {
                            return Some((found_char, self.finger));
                        }
                    }
@ -482,7 +489,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
            let bytes = haystack.get(self.finger..self.finger_back)?;
            // the last byte of the utf8 encoded needle
            // SAFETY: we have an invariant that `utf8_size < 5`
-            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
+            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
            if let Some(index) = memchr::memrchr(last_byte, bytes) {
                // we searched a slice that was offset by self.finger,
                // add self.finger to recoup the original index
@ -493,14 +500,14 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
                // char in the paradigm of reverse iteration). For
                // multibyte chars we need to skip down by the number of more
                // bytes they have than ASCII
-                let shift = self.utf8_size - 1;
+                let shift = self.utf8_size() - 1;
                if index >= shift {
                    let found_char = index - shift;
-                    if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
-                        if slice == &self.utf8_encoded[0..self.utf8_size] {
+                    if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size())) {
+                        if slice == &self.utf8_encoded[0..self.utf8_size()] {
                            // move finger to before the character found (i.e., at its start index)
                            self.finger_back = found_char;
-                            return Some((self.finger_back, self.finger_back + self.utf8_size));
+                            return Some((self.finger_back, self.finger_back + self.utf8_size()));
                        }
                    }
                }
@ -542,7 +549,12 @@ impl<'a> Pattern<'a> for char {
    #[inline]
    fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
        let mut utf8_encoded = [0; 4];
-        let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
+        let utf8_size = self
+            .encode_utf8(&mut utf8_encoded)
+            .len()
+            .try_into()
+            .expect("char len should be less than 255");
+
        CharSearcher {
            haystack,
            finger: 0,