Mirror of https://github.com/rust-lang/rust.git (synced 2024-11-25 16:24:46 +00:00)
Auto merge of #30740 - bluss:ascii-is-the-best, r=brson
Add fast path for ASCII in UTF-8 validation

This speeds up the ASCII case (and long stretches of ASCII in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input we can improve throughput (megabytes verified / second) by a factor of 13 to 14 (smallish input). On XML and mostly English-language input (an en.wikipedia XML dump), throughput improves by a factor of 7 (large input). On mostly non-ASCII input, performance increases slightly or stays the same.

The UTF-8 validation is rewritten to use indexed access; since every access is preceded by a (mandatory for validation) length check, the bounds checks are statically elided by LLVM, and this formulation is in fact the best for performance. A previous version had losses due to slice-to-iterator conversions.

A large part of the credit goes to Björn Steinbrink, who improved this patch immensely by writing this second version.

Benchmark results on x86-64 (Sandy Bridge), compiled with `-C opt-level=3`. Old code is `regular`, this PR is `fast`. Datasets:

- `ascii` is just ASCII (2.5 kB)
- `cyr` is Cyrillic script with ASCII spaces (5 kB)
- `dewik10` is 10 MB of a de.wikipedia XML dump
- `enwik8` is 100 MB of an en.wikipedia XML dump
- `jawik10` is 10 MB of a ja.wikipedia XML dump

```
test from_utf8_ascii_fast      ... bench:        140 ns/iter (+/- 4) = 18221 MB/s
test from_utf8_ascii_regular   ... bench:      1,932 ns/iter (+/- 19) = 1320 MB/s
test from_utf8_cyr_fast        ... bench:     10,025 ns/iter (+/- 245) = 511 MB/s
test from_utf8_cyr_regular     ... bench:     10,944 ns/iter (+/- 795) = 468 MB/s
test from_utf8_dewik10_fast    ... bench:  6,017,909 ns/iter (+/- 105,755) = 1740 MB/s
test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s
test from_utf8_enwik8_fast     ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s
test from_utf8_enwik8_regular  ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s
test from_utf8_jawik10_fast    ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s
test from_utf8_jawik10_regular ... bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s
```

Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
Commit: e7e4ecc522
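Before reading the diff, a minimal standalone sketch of the word-at-a-time ASCII check described above may help. This is illustrative only and not the code in this commit: the patch itself reads aligned machine words through raw pointers inside `run_utf8_validation`, while the sketch below (hypothetical name `is_ascii_wordwise`, using `chunks_exact` and `u64::from_ne_bytes` from today's stable Rust) sidesteps the alignment handling entirely.

```rust
// Illustrative sketch only -- not the libcore implementation in this commit.
// The mask has the high bit of every byte set; a word that ANDs to zero
// therefore contains only bytes < 128, i.e. pure ASCII.
const NONASCII_MASK: u64 = 0x8080_8080_8080_8080;

fn is_ascii_wordwise(bytes: &[u8]) -> bool {
    let mut chunks = bytes.chunks_exact(8);
    for chunk in &mut chunks {
        // Copy the 8-byte chunk into a word and test all eight bytes at once.
        let mut buf = [0u8; 8];
        buf.copy_from_slice(chunk);
        if u64::from_ne_bytes(buf) & NONASCII_MASK != 0 {
            return false;
        }
    }
    // The tail shorter than a word is checked byte by byte.
    chunks.remainder().iter().all(|&b| b < 128)
}

fn main() {
    assert!(is_ascii_wordwise(b"plain ascii, checked a word at a time"));
    assert!(!is_ascii_wordwise("naïve".as_bytes()));
}
```

One 64-bit test replaces eight per-byte comparisons, which is where the order-of-magnitude ASCII speedups in the benchmarks come from; the actual patch goes further by reading two words per iteration, and only when the data pointer is word-aligned.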
```diff
@@ -479,6 +479,18 @@ fn test_is_utf8() {
     assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
 }
 
+#[test]
+fn from_utf8_mostly_ascii() {
+    // deny invalid bytes embedded in long stretches of ascii
+    for i in 32..64 {
+        let mut data = [0; 128];
+        data[i] = 0xC0;
+        assert!(from_utf8(&data).is_err());
+        data[i] = 0xC2;
+        assert!(from_utf8(&data).is_err());
+    }
+}
+
 #[test]
 fn test_is_utf16() {
     use rustc_unicode::str::is_utf16;
```
```diff
@@ -32,6 +32,7 @@ use option::Option::{self, None, Some};
 use raw::{Repr, Slice};
 use result::Result::{self, Ok, Err};
 use slice::{self, SliceExt};
+use usize;
 
 pub mod pattern;
 
```
```diff
@@ -240,7 +241,7 @@ impl Utf8Error {
 /// ```
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
-    try!(run_utf8_validation_iterator(&mut v.iter()));
+    try!(run_utf8_validation(v));
     Ok(unsafe { from_utf8_unchecked(v) })
 }
 
```
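This hunk swaps the validator behind `str::from_utf8` without changing its public contract. As a reminder of that contract, here is a small usage sketch, not part of the diff, written against today's `std`, where `Utf8Error::valid_up_to()` exposes the `valid_up_to` field that the validator fills in:

```rust
use std::str;

fn main() {
    // Valid UTF-8 is accepted and borrowed as &str without copying.
    assert_eq!(str::from_utf8(b"hello").unwrap(), "hello");

    // A stray continuation byte (0x80) at index 2 is rejected, and the
    // error reports how many leading bytes were valid UTF-8.
    let err = str::from_utf8(&[b'a', b'b', 0x80, b'c']).unwrap_err();
    assert_eq!(err.valid_up_to(), 2);
}
```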
```diff
@@ -1074,46 +1075,44 @@ unsafe fn cmp_slice(a: &str, b: &str, len: usize) -> i32 {
 }
 
 /*
-Section: Misc
+Section: UTF-8 validation
 */
 
+// use truncation to fit u64 into usize
+const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+
+/// Return `true` if any byte in the word `x` is nonascii (>= 128).
+#[inline]
+fn contains_nonascii(x: usize) -> bool {
+    (x & NONASCII_MASK) != 0
+}
+
 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
 /// returning `true` in that case, or, if it is invalid, `false` with
 /// `iter` reset such that it is pointing at the first byte in the
 /// invalid sequence.
 #[inline(always)]
-fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
-                                -> Result<(), Utf8Error> {
-    let whole = iter.as_slice();
-    loop {
-        // save the current thing we're pointing at.
-        let old = iter.clone();
-
-        // restore the iterator we had at the start of this codepoint.
+fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
+    let mut offset = 0;
+    let len = v.len();
+    while offset < len {
+        let old_offset = offset;
         macro_rules! err { () => {{
-            *iter = old.clone();
             return Err(Utf8Error {
-                valid_up_to: whole.len() - iter.as_slice().len()
+                valid_up_to: old_offset
             })
         }}}
 
-        macro_rules! next { () => {
-            match iter.next() {
-                Some(a) => *a,
-                // we needed data, but there was none: error!
-                None => err!(),
+        macro_rules! next { () => {{
+            offset += 1;
+            // we needed data, but there was none: error!
+            if offset >= len {
+                err!()
             }
-        }}
+            v[offset]
+        }}}
 
-        let first = match iter.next() {
-            Some(&b) => b,
-            // we're at the end of the iterator and a codepoint
-            // boundary at the same time, so this string is valid.
-            None => return Ok(())
-        };
-
-        // ASCII characters are always valid, so only large
-        // bytes need more examination.
+        let first = v[offset];
         if first >= 128 {
             let w = UTF8_CHAR_WIDTH[first as usize];
             let second = next!();
```
```diff
@@ -1156,8 +1155,42 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
                 }
                 _ => err!()
             }
+            offset += 1;
+        } else {
+            // Ascii case, try to skip forward quickly.
+            // When the pointer is aligned, read 2 words of data per iteration
+            // until we find a word containing a non-ascii byte.
+            const BYTES_PER_ITERATION: usize = 2 * usize::BYTES;
+            let ptr = v.as_ptr();
+            let align = (ptr as usize + offset) & (usize::BYTES - 1);
+            if align == 0 {
+                if len >= BYTES_PER_ITERATION {
+                    while offset <= len - BYTES_PER_ITERATION {
+                        unsafe {
+                            let u = *(ptr.offset(offset as isize) as *const usize);
+                            let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
+
+                            // break if there is a nonascii byte
+                            let zu = contains_nonascii(u);
+                            let zv = contains_nonascii(v);
+                            if zu || zv {
+                                break;
+                            }
+                        }
+                        offset += BYTES_PER_ITERATION;
+                    }
+                }
+                // step from the point where the wordwise loop stopped
+                while offset < len && v[offset] < 128 {
+                    offset += 1;
+                }
+            } else {
+                offset += 1;
+            }
         }
     }
+
+    Ok(())
 }
 
 // https://tools.ietf.org/html/rfc3629
```