mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-27 01:04:03 +00:00
std: convert first_non_utf8_byte to use the iterator.
This makes it very slightly faster, especially when the string is valid UTF-8, and completely removes the use of `unsafe` from the first half. Before: from_utf8_lossy_100_ascii ... bench: 151 ns/iter (+/- 17) from_utf8_lossy_100_invalid ... bench: 447 ns/iter (+/- 33) from_utf8_lossy_100_multibyte ... bench: 135 ns/iter (+/- 4) from_utf8_lossy_invalid ... bench: 124 ns/iter (+/- 10 After: from_utf8_lossy_100_ascii ... bench: 119 ns/iter (+/- 8) from_utf8_lossy_100_invalid ... bench: 454 ns/iter (+/- 16) from_utf8_lossy_100_multibyte ... bench: 116 ns/iter (+/- 9) from_utf8_lossy_invalid ... bench: 119 ns/iter (+/- 9)
This commit is contained in:
parent
a68d10e6ad
commit
a39056e614
@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {
|
||||
|
||||
#[inline(always)]
|
||||
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
|
||||
let mut i = 0u;
|
||||
let total = v.len();
|
||||
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
|
||||
unsafe { *xs.unsafe_ref(i) }
|
||||
let mut it = v.iter();
|
||||
|
||||
let ok = run_utf8_validation_iterator(&mut it);
|
||||
if ok {
|
||||
None
|
||||
} else {
|
||||
// work out how many valid bytes we've consumed
|
||||
// (run_utf8_validation_iterator resets the iterator to just
|
||||
// after the last good byte), which we can do because the
|
||||
// vector iterator size_hint is exact.
|
||||
let (remaining, _) = it.size_hint();
|
||||
Some(v.len() - remaining)
|
||||
}
|
||||
while i < total {
|
||||
let v_i = unsafe_get(v, i);
|
||||
if v_i < 128u8 {
|
||||
i += 1u;
|
||||
} else {
|
||||
let w = utf8_char_width(v_i);
|
||||
if w == 0u { return Some(i); }
|
||||
|
||||
let nexti = i + w;
|
||||
if nexti > total { return Some(i); }
|
||||
|
||||
// 2-byte encoding is for codepoints \u0080 to \u07ff
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u0800 to \uffff
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// excluding surrogates codepoints \ud800 to \udfff
|
||||
// ED A0 80 to ED BF BF
|
||||
// 4-byte encoding is for codepoints \u10000 to \u10ffff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
// UTF8-tail = %x80-BF
|
||||
match w {
|
||||
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
|
||||
return Some(i)
|
||||
},
|
||||
3 => match (v_i,
|
||||
unsafe_get(v, i + 1),
|
||||
unsafe_get(v, i + 2) & 192u8) {
|
||||
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
|
||||
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
|
||||
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
|
||||
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
|
||||
_ => return Some(i),
|
||||
},
|
||||
_ => match (v_i,
|
||||
unsafe_get(v, i + 1),
|
||||
unsafe_get(v, i + 2) & 192u8,
|
||||
unsafe_get(v, i + 3) & 192u8) {
|
||||
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
_ => return Some(i)
|
||||
},
|
||||
}
|
||||
|
||||
i = nexti;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Determines if a vector of `u16` contains valid UTF-16
|
||||
|
Loading…
Reference in New Issue
Block a user