From 90c813a0f0b5042a2bbf2d9ebf27f21acdbc9f77 Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 09:06:21 +0000 Subject: [PATCH] Move utf-8 validating helpers to new mod --- library/core/src/str/iter.rs | 2 +- library/core/src/str/lossy.rs | 10 +- library/core/src/str/mod.rs | 280 +--------------------------- library/core/src/str/validations.rs | 275 +++++++++++++++++++++++++++ 4 files changed, 288 insertions(+), 279 deletions(-) create mode 100644 library/core/src/str/validations.rs diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 993df96a2d1..27a67e2b22f 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -12,8 +12,8 @@ use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; use super::pattern::Pattern; use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; +use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; use super::LinesAnyMap; -use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace}; diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 88b2bc551b7..720a35bbc8f 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -1,7 +1,9 @@ use crate::char; use crate::fmt::{self, Write}; use crate::mem; -use crate::str as core_str; + +use super::from_utf8_unchecked; +use super::validations::utf8_char_width; /// Lossy UTF-8 string. #[unstable(feature = "str_internals", issue = "none")] @@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { if byte < 128 { } else { - let w = core_str::utf8_char_width(byte); + let w = utf8_char_width(byte); macro_rules! error { () => {{ // SAFETY: We have checked up to `i` that source is valid UTF-8. unsafe { let r = Utf8LossyChunk { - valid: core_str::from_utf8_unchecked(&self.source[0..i_]), + valid: from_utf8_unchecked(&self.source[0..i_]), broken: &self.source[i_..i], }; self.source = &self.source[i..]; @@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { let r = Utf8LossyChunk { // SAFETY: We have checked that the entire source is valid UTF-8. - valid: unsafe { core_str::from_utf8_unchecked(self.source) }, + valid: unsafe { from_utf8_unchecked(self.source) }, broken: &[], }; self.source = &[]; diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 02b85ebfe49..ab9bec2fd2d 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -11,6 +11,7 @@ mod error; mod iter; mod traits; +mod validations; use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; @@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace; #[unstable(feature = "split_inclusive", issue = "72360")] use iter::SplitInclusive; +#[unstable(feature = "str_internals", issue = "none")] +pub use validations::next_code_point; + use iter::MatchIndicesInternal; use iter::SplitInternal; use iter::{MatchesInternal, SplitNInternal}; +use validations::{run_utf8_validation, truncate_to_char_boundary}; + /* Section: Creating a string */ @@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { unsafe { &mut *(v as *mut [u8] as *mut str) } } -/// Returns the initial codepoint accumulator for the first byte. -/// The first byte is special, only want bottom 5 bits for width 2, 4 bits -/// for width 3, and 3 bits for width 4. -#[inline] -fn utf8_first_byte(byte: u8, width: u32) -> u32 { - (byte & (0x7F >> width)) as u32 -} - -/// Returns the value of `ch` updated with continuation byte `byte`. -#[inline] -fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { - (ch << 6) | (byte & CONT_MASK) as u32 -} - -/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the -/// bits `10`). -#[inline] -fn utf8_is_cont_byte(byte: u8) -> bool { - (byte & !CONT_MASK) == TAG_CONT_U8 -} - -#[inline] -fn unwrap_or_0(opt: Option<&u8>) -> u8 { - match opt { - Some(&byte) => byte, - None => 0, - } -} - -/// Reads the next code point out of a byte iterator (assuming a -/// UTF-8-like encoding). -#[unstable(feature = "str_internals", issue = "none")] -#[inline] -pub fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { - // Decode UTF-8 - let x = *bytes.next()?; - if x < 128 { - return Some(x as u32); - } - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte(x, 2); - let y = unwrap_or_0(bytes.next()); - let mut ch = utf8_acc_cont_byte(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid - let z = unwrap_or_0(bytes.next()); - let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = unwrap_or_0(bytes.next()); - ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); - } - } - - Some(ch) -} - -/// Reads the last code point out of a byte iterator (assuming a -/// UTF-8-like encoding). -#[inline] -fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option -where - I: DoubleEndedIterator, -{ - // Decode UTF-8 - let w = match *bytes.next_back()? { - next_byte if next_byte < 128 => return Some(next_byte as u32), - back_byte => back_byte, - }; - - // Multibyte case follows - // Decode from a byte combination out of: [x [y [z w]]] - let mut ch; - let z = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(z, 2); - if utf8_is_cont_byte(z) { - let y = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(y, 3); - if utf8_is_cont_byte(y) { - let x = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(x, 4); - ch = utf8_acc_cont_byte(ch, y); - } - ch = utf8_acc_cont_byte(ch, z); - } - ch = utf8_acc_cont_byte(ch, w); - - Some(ch) -} - impl_fn_for_zst! { /// A nameable, cloneable fn type #[derive(Clone)] @@ -363,184 +273,6 @@ impl_fn_for_zst! { }; } -/* -Section: UTF-8 validation -*/ - -// use truncation to fit u64 into usize -const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; - -/// Returns `true` if any byte in the word `x` is nonascii (>= 128). -#[inline] -fn contains_nonascii(x: usize) -> bool { - (x & NONASCII_MASK) != 0 -} - -/// Walks through `v` checking that it's a valid UTF-8 sequence, -/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. -#[inline(always)] -fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { - let mut index = 0; - let len = v.len(); - - let usize_bytes = mem::size_of::(); - let ascii_block_size = 2 * usize_bytes; - let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; - let align = v.as_ptr().align_offset(usize_bytes); - - while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }); - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! - if index >= len { - err!(None) - } - v[index] - }}; - } - - let first = v[index]; - if first >= 128 { - let w = UTF8_CHAR_WIDTH[first as usize]; - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(2)) - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(3)) - } - } - _ => err!(Some(1)), - } - index += 1; - } else { - // Ascii case, try to skip forward quickly. - // When the pointer is aligned, read 2 words of data per iteration - // until we find a word containing a non-ascii byte. - if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { - let ptr = v.as_ptr(); - while index < blocks_end { - // SAFETY: since `align - index` and `ascii_block_size` are - // multiples of `usize_bytes`, `block = ptr.add(index)` is - // always aligned with a `usize` so it's safe to dereference - // both `block` and `block.offset(1)`. - unsafe { - let block = ptr.add(index) as *const usize; - // break if there is a nonascii byte - let zu = contains_nonascii(*block); - let zv = contains_nonascii(*block.offset(1)); - if zu | zv { - break; - } - } - index += ascii_block_size; - } - // step from the point where the wordwise loop stopped - while index < len && v[index] < 128 { - index += 1; - } - } else { - index += 1; - } - } - } - - Ok(()) -} - -// https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8; 256] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x7F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, // 0x9F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, // 0xBF - 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, // 0xDF - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF - 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF -]; - -/// Given a first byte, determines how many bytes are in this UTF-8 character. -#[unstable(feature = "str_internals", issue = "none")] -#[inline] -pub fn utf8_char_width(b: u8) -> usize { - UTF8_CHAR_WIDTH[b as usize] as usize -} - -/// Mask of the value bits of a continuation byte. -const CONT_MASK: u8 = 0b0011_1111; -/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. -const TAG_CONT_U8: u8 = 0b1000_0000; - -// truncate `&str` to length at most equal to `max` -// return `true` if it were truncated, and the new str. -fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { - if max >= s.len() { - (false, s) - } else { - while !s.is_char_boundary(max) { - max -= 1; - } - (true, &s[..max]) - } -} - #[inline(never)] #[cold] #[track_caller] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs new file mode 100644 index 00000000000..10cf1e172e6 --- /dev/null +++ b/library/core/src/str/validations.rs @@ -0,0 +1,275 @@ +//! Operations related to UTF-8 validation. + +use crate::mem; + +use super::Utf8Error; + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +pub(super) fn utf8_is_cont_byte(byte: u8) -> bool { + (byte & !CONT_MASK) == TAG_CONT_U8 +} + +#[inline] +fn unwrap_or_0(opt: Option<&u8>) -> u8 { + match opt { + Some(&byte) => byte, + None => 0, + } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // Decode UTF-8 + let x = *bytes.next()?; + if x < 128 { + return Some(x as u32); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = unwrap_or_0(bytes.next()); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(bytes.next()); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(bytes.next()); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +#[inline] +pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option +where + I: DoubleEndedIterator, +{ + // Decode UTF-8 + let w = match *bytes.next_back()? { + next_byte if next_byte < 128 => return Some(next_byte as u32), + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + let z = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + let y = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + let x = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + Some(ch) +} + +// use truncation to fit u64 into usize +const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; + +/// Returns `true` if any byte in the word `x` is nonascii (>= 128). +#[inline] +fn contains_nonascii(x: usize) -> bool { + (x & NONASCII_MASK) != 0 +} + +/// Walks through `v` checking that it's a valid UTF-8 sequence, +/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. +#[inline(always)] +pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { + let mut index = 0; + let len = v.len(); + + let usize_bytes = mem::size_of::(); + let ascii_block_size = 2 * usize_bytes; + let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; + let align = v.as_ptr().align_offset(usize_bytes); + + while index < len { + let old_offset = index; + macro_rules! err { + ($error_len: expr) => { + return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }); + }; + } + + macro_rules! next { + () => {{ + index += 1; + // we needed data, but there was none: error! + if index >= len { + err!(None) + } + v[index] + }}; + } + + let first = v[index]; + if first >= 128 { + let w = UTF8_CHAR_WIDTH[first as usize]; + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => { + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(1)) + } + } + 3 => { + match (first, next!()) { + (0xE0, 0xA0..=0xBF) + | (0xE1..=0xEC, 0x80..=0xBF) + | (0xED, 0x80..=0x9F) + | (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => err!(Some(1)), + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) + } + } + 4 => { + match (first, next!()) { + (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} + _ => err!(Some(1)), + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(3)) + } + } + _ => err!(Some(1)), + } + index += 1; + } else { + // Ascii case, try to skip forward quickly. + // When the pointer is aligned, read 2 words of data per iteration + // until we find a word containing a non-ascii byte. + if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { + let ptr = v.as_ptr(); + while index < blocks_end { + // SAFETY: since `align - index` and `ascii_block_size` are + // multiples of `usize_bytes`, `block = ptr.add(index)` is + // always aligned with a `usize` so it's safe to dereference + // both `block` and `block.offset(1)`. + unsafe { + let block = ptr.add(index) as *const usize; + // break if there is a nonascii byte + let zu = contains_nonascii(*block); + let zv = contains_nonascii(*block.offset(1)); + if zu | zv { + break; + } + } + index += ascii_block_size; + } + // step from the point where the wordwise loop stopped + while index < len && v[index] < 128 { + index += 1; + } + } else { + index += 1; + } + } + } + + Ok(()) +} + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0xBF + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, // 0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF +]; + +/// Given a first byte, determines how many bytes are in this UTF-8 character. +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + UTF8_CHAR_WIDTH[b as usize] as usize +} + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; +/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. +const TAG_CONT_U8: u8 = 0b1000_0000; + +// truncate `&str` to length at most equal to `max` +// return `true` if it were truncated, and the new str. +pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { + if max >= s.len() { + (false, s) + } else { + while !s.is_char_boundary(max) { + max -= 1; + } + (true, &s[..max]) + } +}