mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-23 23:34:48 +00:00
Move utf-8 validating helpers to new mod
This commit is contained in:
parent
5f0d724e29
commit
90c813a0f0
@ -12,8 +12,8 @@ use crate::slice::{self, Split as SliceSplit};
|
||||
use super::from_utf8_unchecked;
|
||||
use super::pattern::Pattern;
|
||||
use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
|
||||
use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
|
||||
use super::LinesAnyMap;
|
||||
use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
|
||||
use super::{BytesIsNotEmpty, UnsafeBytesToStr};
|
||||
use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
|
||||
use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace};
|
||||
|
@ -1,7 +1,9 @@
|
||||
use crate::char;
|
||||
use crate::fmt::{self, Write};
|
||||
use crate::mem;
|
||||
use crate::str as core_str;
|
||||
|
||||
use super::from_utf8_unchecked;
|
||||
use super::validations::utf8_char_width;
|
||||
|
||||
/// Lossy UTF-8 string.
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
|
||||
|
||||
if byte < 128 {
|
||||
} else {
|
||||
let w = core_str::utf8_char_width(byte);
|
||||
let w = utf8_char_width(byte);
|
||||
|
||||
macro_rules! error {
|
||||
() => {{
|
||||
// SAFETY: We have checked up to `i` that source is valid UTF-8.
|
||||
unsafe {
|
||||
let r = Utf8LossyChunk {
|
||||
valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
|
||||
valid: from_utf8_unchecked(&self.source[0..i_]),
|
||||
broken: &self.source[i_..i],
|
||||
};
|
||||
self.source = &self.source[i..];
|
||||
@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
|
||||
|
||||
let r = Utf8LossyChunk {
|
||||
// SAFETY: We have checked that the entire source is valid UTF-8.
|
||||
valid: unsafe { core_str::from_utf8_unchecked(self.source) },
|
||||
valid: unsafe { from_utf8_unchecked(self.source) },
|
||||
broken: &[],
|
||||
};
|
||||
self.source = &[];
|
||||
|
@ -11,6 +11,7 @@
|
||||
mod error;
|
||||
mod iter;
|
||||
mod traits;
|
||||
mod validations;
|
||||
|
||||
use self::pattern::Pattern;
|
||||
use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
|
||||
@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace;
|
||||
#[unstable(feature = "split_inclusive", issue = "72360")]
|
||||
use iter::SplitInclusive;
|
||||
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
pub use validations::next_code_point;
|
||||
|
||||
use iter::MatchIndicesInternal;
|
||||
use iter::SplitInternal;
|
||||
use iter::{MatchesInternal, SplitNInternal};
|
||||
|
||||
use validations::{run_utf8_validation, truncate_to_char_boundary};
|
||||
|
||||
/*
|
||||
Section: Creating a string
|
||||
*/
|
||||
@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
|
||||
unsafe { &mut *(v as *mut [u8] as *mut str) }
|
||||
}
|
||||
|
||||
/// Returns the initial code point accumulator for the first byte of a
/// multi-byte UTF-8 sequence.
///
/// The first byte is special: only its bottom 5 bits are payload for width 2,
/// 4 bits for width 3, and 3 bits for width 4. `width` is the sequence length
/// in bytes; the mask `0x7F >> width` keeps exactly those low bits and drops
/// the length tag.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    u32::from(byte & payload_mask)
}
|
||||
|
||||
/// Returns the value of `ch` updated with continuation byte `byte`.
|
||||
#[inline]
|
||||
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
|
||||
(ch << 6) | (byte & CONT_MASK) as u32
|
||||
}
|
||||
|
||||
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
|
||||
/// bits `10`).
|
||||
#[inline]
|
||||
fn utf8_is_cont_byte(byte: u8) -> bool {
|
||||
(byte & !CONT_MASK) == TAG_CONT_U8
|
||||
}
|
||||
|
||||
/// Returns the byte referenced by `opt`, or `0` when `opt` is `None`.
///
/// The decoders use this to stand in a zero byte when the iterator ends in the
/// middle of a multi-byte sequence.
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    opt.copied().unwrap_or(0)
}
|
||||
|
||||
/// Reads the next code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
#[inline]
|
||||
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
|
||||
// Decode UTF-8
|
||||
let x = *bytes.next()?;
|
||||
if x < 128 {
|
||||
return Some(x as u32);
|
||||
}
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [[[x y] z] w]
|
||||
// NOTE: Performance is sensitive to the exact formulation here
|
||||
let init = utf8_first_byte(x, 2);
|
||||
let y = unwrap_or_0(bytes.next());
|
||||
let mut ch = utf8_acc_cont_byte(init, y);
|
||||
if x >= 0xE0 {
|
||||
// [[x y z] w] case
|
||||
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
|
||||
let z = unwrap_or_0(bytes.next());
|
||||
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
|
||||
ch = init << 12 | y_z;
|
||||
if x >= 0xF0 {
|
||||
// [x y z w] case
|
||||
// use only the lower 3 bits of `init`
|
||||
let w = unwrap_or_0(bytes.next());
|
||||
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
|
||||
}
|
||||
}
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
/// Reads the last code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
#[inline]
|
||||
fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
|
||||
where
|
||||
I: DoubleEndedIterator<Item = &'a u8>,
|
||||
{
|
||||
// Decode UTF-8
|
||||
let w = match *bytes.next_back()? {
|
||||
next_byte if next_byte < 128 => return Some(next_byte as u32),
|
||||
back_byte => back_byte,
|
||||
};
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [x [y [z w]]]
|
||||
let mut ch;
|
||||
let z = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(z, 2);
|
||||
if utf8_is_cont_byte(z) {
|
||||
let y = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(y, 3);
|
||||
if utf8_is_cont_byte(y) {
|
||||
let x = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(x, 4);
|
||||
ch = utf8_acc_cont_byte(ch, y);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, z);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, w);
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
impl_fn_for_zst! {
|
||||
/// A nameable, cloneable fn type
|
||||
#[derive(Clone)]
|
||||
@ -363,184 +273,6 @@ impl_fn_for_zst! {
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
Section: UTF-8 validation
|
||||
*/
|
||||
|
||||
/// Word with the high bit of every byte set (`0x80` repeated).
///
/// Built from the actual size of `usize` rather than by truncating a 64-bit
/// literal, so every byte lane is covered whatever the pointer width is.
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; mem::size_of::<usize>()]);

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
    (x & NONASCII_MASK) != 0
}
|
||||
|
||||
/// Walks through `v` checking that it's a valid UTF-8 sequence,
|
||||
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
|
||||
#[inline(always)]
|
||||
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
let mut index = 0;
|
||||
let len = v.len();
|
||||
|
||||
let usize_bytes = mem::size_of::<usize>();
|
||||
let ascii_block_size = 2 * usize_bytes;
|
||||
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
|
||||
let align = v.as_ptr().align_offset(usize_bytes);
|
||||
|
||||
while index < len {
|
||||
let old_offset = index;
|
||||
macro_rules! err {
|
||||
($error_len: expr) => {
|
||||
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! next {
|
||||
() => {{
|
||||
index += 1;
|
||||
// we needed data, but there was none: error!
|
||||
if index >= len {
|
||||
err!(None)
|
||||
}
|
||||
v[index]
|
||||
}};
|
||||
}
|
||||
|
||||
let first = v[index];
|
||||
if first >= 128 {
|
||||
let w = UTF8_CHAR_WIDTH[first as usize];
|
||||
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// excluding surrogates codepoints \u{d800} to \u{dfff}
|
||||
// ED A0 80 to ED BF BF
|
||||
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
match w {
|
||||
2 => {
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(1))
|
||||
}
|
||||
}
|
||||
3 => {
|
||||
match (first, next!()) {
|
||||
(0xE0, 0xA0..=0xBF)
|
||||
| (0xE1..=0xEC, 0x80..=0xBF)
|
||||
| (0xED, 0x80..=0x9F)
|
||||
| (0xEE..=0xEF, 0x80..=0xBF) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
}
|
||||
4 => {
|
||||
match (first, next!()) {
|
||||
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(3))
|
||||
}
|
||||
}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
index += 1;
|
||||
} else {
|
||||
// Ascii case, try to skip forward quickly.
|
||||
// When the pointer is aligned, read 2 words of data per iteration
|
||||
// until we find a word containing a non-ascii byte.
|
||||
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
|
||||
let ptr = v.as_ptr();
|
||||
while index < blocks_end {
|
||||
// SAFETY: since `align - index` and `ascii_block_size` are
|
||||
// multiples of `usize_bytes`, `block = ptr.add(index)` is
|
||||
// always aligned with a `usize` so it's safe to dereference
|
||||
// both `block` and `block.offset(1)`.
|
||||
unsafe {
|
||||
let block = ptr.add(index) as *const usize;
|
||||
// break if there is a nonascii byte
|
||||
let zu = contains_nonascii(*block);
|
||||
let zv = contains_nonascii(*block.offset(1));
|
||||
if zu | zv {
|
||||
break;
|
||||
}
|
||||
}
|
||||
index += ascii_block_size;
|
||||
}
|
||||
// step from the point where the wordwise loop stopped
|
||||
while index < len && v[index] < 128 {
|
||||
index += 1;
|
||||
}
|
||||
} else {
|
||||
index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// https://tools.ietf.org/html/rfc3629
//
// Indexed by the first byte of a sequence; the entry is the sequence length
// in bytes, or 0 when the byte cannot start a sequence. One row per high
// nibble, sixteen entries per row.
#[rustfmt::skip]
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];
|
||||
|
||||
/// Given a first byte, determines how many bytes are in this UTF-8 character.
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
#[inline]
|
||||
pub fn utf8_char_width(b: u8) -> usize {
|
||||
UTF8_CHAR_WIDTH[b as usize] as usize
|
||||
}
|
||||
|
||||
/// Mask of the value bits of a continuation byte.
///
/// A continuation byte has the form `10xx_xxxx`; this keeps the six `x` bits.
const CONT_MASK: u8 = 0b0011_1111;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
///
/// That is, `byte & !CONT_MASK == TAG_CONT_U8` holds exactly for bytes of the
/// form `10xx_xxxx`.
const TAG_CONT_U8: u8 = 0b1000_0000;
|
||||
|
||||
/// Truncates `s` to at most `max` bytes without splitting a character.
///
/// Returns `(false, s)` unchanged when `max >= s.len()`. Otherwise returns
/// `true` together with the longest prefix of `s` that is no longer than
/// `max` bytes and ends on a char boundary.
fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
    if max >= s.len() {
        return (false, s);
    }
    // `max` may land inside a multi-byte character; back up to where that
    // character starts. Index 0 is always a boundary, so this cannot underflow.
    while !s.is_char_boundary(max) {
        max -= 1;
    }
    (true, &s[..max])
}
|
||||
|
||||
#[inline(never)]
|
||||
#[cold]
|
||||
#[track_caller]
|
||||
|
275
library/core/src/str/validations.rs
Normal file
275
library/core/src/str/validations.rs
Normal file
@ -0,0 +1,275 @@
|
||||
//! Operations related to UTF-8 validation.
|
||||
|
||||
use crate::mem;
|
||||
|
||||
use super::Utf8Error;
|
||||
|
||||
/// Returns the initial code point accumulator for the first byte of a
/// multi-byte UTF-8 sequence.
///
/// The first byte is special: only its bottom 5 bits are payload for width 2,
/// 4 bits for width 3, and 3 bits for width 4. `width` is the sequence length
/// in bytes; the mask `0x7F >> width` keeps exactly those low bits and drops
/// the length tag.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    u32::from(byte & payload_mask)
}
|
||||
|
||||
/// Returns the value of `ch` updated with continuation byte `byte`.
|
||||
#[inline]
|
||||
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
|
||||
(ch << 6) | (byte & CONT_MASK) as u32
|
||||
}
|
||||
|
||||
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
|
||||
/// bits `10`).
|
||||
#[inline]
|
||||
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
|
||||
(byte & !CONT_MASK) == TAG_CONT_U8
|
||||
}
|
||||
|
||||
/// Returns the byte referenced by `opt`, or `0` when `opt` is `None`.
///
/// The decoders use this to stand in a zero byte when the iterator ends in the
/// middle of a multi-byte sequence.
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    opt.copied().unwrap_or(0)
}
|
||||
|
||||
/// Reads the next code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
#[inline]
|
||||
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
|
||||
// Decode UTF-8
|
||||
let x = *bytes.next()?;
|
||||
if x < 128 {
|
||||
return Some(x as u32);
|
||||
}
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [[[x y] z] w]
|
||||
// NOTE: Performance is sensitive to the exact formulation here
|
||||
let init = utf8_first_byte(x, 2);
|
||||
let y = unwrap_or_0(bytes.next());
|
||||
let mut ch = utf8_acc_cont_byte(init, y);
|
||||
if x >= 0xE0 {
|
||||
// [[x y z] w] case
|
||||
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
|
||||
let z = unwrap_or_0(bytes.next());
|
||||
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
|
||||
ch = init << 12 | y_z;
|
||||
if x >= 0xF0 {
|
||||
// [x y z w] case
|
||||
// use only the lower 3 bits of `init`
|
||||
let w = unwrap_or_0(bytes.next());
|
||||
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
|
||||
}
|
||||
}
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
/// Reads the last code point out of a byte iterator (assuming a
|
||||
/// UTF-8-like encoding).
|
||||
#[inline]
|
||||
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
|
||||
where
|
||||
I: DoubleEndedIterator<Item = &'a u8>,
|
||||
{
|
||||
// Decode UTF-8
|
||||
let w = match *bytes.next_back()? {
|
||||
next_byte if next_byte < 128 => return Some(next_byte as u32),
|
||||
back_byte => back_byte,
|
||||
};
|
||||
|
||||
// Multibyte case follows
|
||||
// Decode from a byte combination out of: [x [y [z w]]]
|
||||
let mut ch;
|
||||
let z = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(z, 2);
|
||||
if utf8_is_cont_byte(z) {
|
||||
let y = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(y, 3);
|
||||
if utf8_is_cont_byte(y) {
|
||||
let x = unwrap_or_0(bytes.next_back());
|
||||
ch = utf8_first_byte(x, 4);
|
||||
ch = utf8_acc_cont_byte(ch, y);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, z);
|
||||
}
|
||||
ch = utf8_acc_cont_byte(ch, w);
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
/// Word with the high bit of every byte set (`0x80` repeated).
///
/// Built from the actual size of `usize` rather than by truncating a 64-bit
/// literal, so every byte lane is covered whatever the pointer width is.
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; mem::size_of::<usize>()]);

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
    (x & NONASCII_MASK) != 0
}
|
||||
|
||||
/// Walks through `v` checking that it's a valid UTF-8 sequence,
|
||||
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
|
||||
#[inline(always)]
|
||||
pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
let mut index = 0;
|
||||
let len = v.len();
|
||||
|
||||
let usize_bytes = mem::size_of::<usize>();
|
||||
let ascii_block_size = 2 * usize_bytes;
|
||||
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
|
||||
let align = v.as_ptr().align_offset(usize_bytes);
|
||||
|
||||
while index < len {
|
||||
let old_offset = index;
|
||||
macro_rules! err {
|
||||
($error_len: expr) => {
|
||||
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! next {
|
||||
() => {{
|
||||
index += 1;
|
||||
// we needed data, but there was none: error!
|
||||
if index >= len {
|
||||
err!(None)
|
||||
}
|
||||
v[index]
|
||||
}};
|
||||
}
|
||||
|
||||
let first = v[index];
|
||||
if first >= 128 {
|
||||
let w = UTF8_CHAR_WIDTH[first as usize];
|
||||
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// excluding surrogates codepoints \u{d800} to \u{dfff}
|
||||
// ED A0 80 to ED BF BF
|
||||
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
match w {
|
||||
2 => {
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(1))
|
||||
}
|
||||
}
|
||||
3 => {
|
||||
match (first, next!()) {
|
||||
(0xE0, 0xA0..=0xBF)
|
||||
| (0xE1..=0xEC, 0x80..=0xBF)
|
||||
| (0xED, 0x80..=0x9F)
|
||||
| (0xEE..=0xEF, 0x80..=0xBF) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
}
|
||||
4 => {
|
||||
match (first, next!()) {
|
||||
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(3))
|
||||
}
|
||||
}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
index += 1;
|
||||
} else {
|
||||
// Ascii case, try to skip forward quickly.
|
||||
// When the pointer is aligned, read 2 words of data per iteration
|
||||
// until we find a word containing a non-ascii byte.
|
||||
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
|
||||
let ptr = v.as_ptr();
|
||||
while index < blocks_end {
|
||||
// SAFETY: since `align - index` and `ascii_block_size` are
|
||||
// multiples of `usize_bytes`, `block = ptr.add(index)` is
|
||||
// always aligned with a `usize` so it's safe to dereference
|
||||
// both `block` and `block.offset(1)`.
|
||||
unsafe {
|
||||
let block = ptr.add(index) as *const usize;
|
||||
// break if there is a nonascii byte
|
||||
let zu = contains_nonascii(*block);
|
||||
let zv = contains_nonascii(*block.offset(1));
|
||||
if zu | zv {
|
||||
break;
|
||||
}
|
||||
}
|
||||
index += ascii_block_size;
|
||||
}
|
||||
// step from the point where the wordwise loop stopped
|
||||
while index < len && v[index] < 128 {
|
||||
index += 1;
|
||||
}
|
||||
} else {
|
||||
index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// https://tools.ietf.org/html/rfc3629
//
// Indexed by the first byte of a sequence; the entry is the sequence length
// in bytes, or 0 when the byte cannot start a sequence. One row per high
// nibble, sixteen entries per row.
#[rustfmt::skip]
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];
|
||||
|
||||
/// Given a first byte, determines how many bytes are in this UTF-8 character.
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
#[inline]
|
||||
pub fn utf8_char_width(b: u8) -> usize {
|
||||
UTF8_CHAR_WIDTH[b as usize] as usize
|
||||
}
|
||||
|
||||
/// Mask of the value bits of a continuation byte.
///
/// A continuation byte has the form `10xx_xxxx`; this keeps the six `x` bits.
const CONT_MASK: u8 = 0b0011_1111;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
///
/// That is, `byte & !CONT_MASK == TAG_CONT_U8` holds exactly for bytes of the
/// form `10xx_xxxx`.
const TAG_CONT_U8: u8 = 0b1000_0000;
|
||||
|
||||
// truncate `&str` to length at most equal to `max`
|
||||
// return `true` if it were truncated, and the new str.
|
||||
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
|
||||
if max >= s.len() {
|
||||
(false, s)
|
||||
} else {
|
||||
while !s.is_char_boundary(max) {
|
||||
max -= 1;
|
||||
}
|
||||
(true, &s[..max])
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user