Auto merge of #90821 - scottmcm:new-slice-reverse, r=Mark-Simulacrum
MIRI says `reverse` is UB, so replace it with something LLVM can vectorize
For small types with padding, the current implementation is UB because it does integer operations on uninit values.
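To make that concrete, here is a hedged sketch of the kind of type that hits the problem; the playground's exact `Foo` is not reproduced in this description (the playground link below has the original), so the shape here is an assumption: a 2-byte type with one padding byte, which lands in the old size-2 specialization whose `rotate_left` shows up in the trace below.

```
// Hypothetical repro shape (assumed, not the playground's exact code):
// `Foo` is 2 bytes with 1 byte of padding, so the old size-2 path read two
// elements as a `u32` (picking up uninitialized padding) and rotated it.
#[repr(align(2))]
#[derive(Copy, Clone, Debug, PartialEq)]
struct Foo(u8); // size 2, align 2 => one trailing padding byte

fn main() {
    let mut v = [Foo(1), Foo(2), Foo(3), Foo(4), Foo(5), Foo(6), Foo(7), Foo(8)];
    v.reverse(); // `cargo miri run` on a pre-fix toolchain reports the error below
    assert_eq!(v[0], Foo(8));
}
```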
```
error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
--> /playground/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/num/mod.rs:836:5
|
836 | / uint_impl! { u32, u32, i32, 32, 4294967295, 8, "0x10000b3", "0xb301", "0x12345678",
837 | | "0x78563412", "0x1e6a2c48", "[0x78, 0x56, 0x34, 0x12]", "[0x12, 0x34, 0x56, 0x78]", "", "" }
| |________________________________________________________________________________________________^ using uninitialized data, but this operation requires initialized memory
|
= help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
= help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
= note: inside `core::num::<impl u32>::rotate_left` at /playground/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/num/uint_macros.rs:211:13
= note: inside `core::slice::<impl [Foo]>::reverse` at /playground/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/slice/mod.rs:701:58
```
<https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=340739f22ca5b457e1da6f361768edc6>
But LLVM has gotten smarter since I wrote the previous implementation in 2017, so this PR removes all the manual magic and just writes the loop in such a way that LLVM will vectorize it. The new code is much simpler, has very little `unsafe`, and is actually faster to boot!
If you're curious to see the codegen: <https://rust.godbolt.org/z/Pcn13Y9E3>
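For readers who just want the shape of the new approach, here is a condensed, safe-code sketch; the real implementation is in the diff below, `reverse_sketch` is an illustrative name, and `split_at_mut` stands in for the raw-pointer construction of the two halves.

```
// Condensed sketch of the approach in the diff below (not the std code):
// split the slice into two non-overlapping halves, then swap elementwise
// across a function boundary.
fn reverse_sketch<T>(slice: &mut [T]) {
    let half_len = slice.len() / 2;
    let (front, rest) = slice.split_at_mut(half_len);
    let back_start = rest.len() - half_len;
    let back = &mut rest[back_start..];
    revswap(front, back, half_len);

    #[inline]
    fn revswap<T>(a: &mut [T], b: &mut [T], n: usize) {
        // Slicing to `n` up front lets LLVM drop the bounds checks below.
        let (a, b) = (&mut a[..n], &mut b[..n]);
        for i in 0..n {
            std::mem::swap(&mut a[i], &mut b[n - 1 - i]);
        }
    }
}

fn main() {
    let mut v = [1, 2, 3, 4, 5];
    reverse_sketch(&mut v);
    assert_eq!(v, [5, 4, 3, 2, 1]);
}
```

The function boundary is the load-bearing part: `revswap` receives the two halves as separate `&mut [T]` arguments, so LLVM sees them as `noalias` and can turn the element-wise swaps into vector shuffles.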
Before:
```
running 7 tests
test slice::reverse_simd_f64x4 ... bench: 17,940 ns/iter (+/- 481) = 58448 MB/s
test slice::reverse_u128 ... bench: 17,758 ns/iter (+/- 205) = 59048 MB/s
test slice::reverse_u16 ... bench: 158,234 ns/iter (+/- 6,876) = 6626 MB/s
test slice::reverse_u32 ... bench: 62,047 ns/iter (+/- 1,117) = 16899 MB/s
test slice::reverse_u64 ... bench: 31,582 ns/iter (+/- 552) = 33201 MB/s
test slice::reverse_u8 ... bench: 81,253 ns/iter (+/- 1,510) = 12905 MB/s
test slice::reverse_u8x3 ... bench: 270,615 ns/iter (+/- 11,463) = 3874 MB/s
```
After:
```
running 7 tests
test slice::reverse_simd_f64x4 ... bench: 17,731 ns/iter (+/- 306) = 59137 MB/s
test slice::reverse_u128 ... bench: 17,919 ns/iter (+/- 239) = 58517 MB/s
test slice::reverse_u16 ... bench: 43,160 ns/iter (+/- 607) = 24295 MB/s
test slice::reverse_u32 ... bench: 21,065 ns/iter (+/- 371) = 49778 MB/s
test slice::reverse_u64 ... bench: 21,118 ns/iter (+/- 482) = 49653 MB/s
test slice::reverse_u8 ... bench: 76,878 ns/iter (+/- 1,688) = 13639 MB/s
test slice::reverse_u8x3 ... bench: 264,723 ns/iter (+/- 5,544) = 3961 MB/s
```
Those are the existing benches, `library/alloc/benches/slice.rs` lines 322-346 (at commit 14a2fd640e).
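If you want to reproduce numbers in this ns/iter and MB/s format locally, here is a hedged sketch of a nightly libtest benchmark in the same spirit; it is not the code at that path, the file name and sizes are assumptions, and the MB/s column comes from setting `Bencher::bytes`.

```
// benches/reverse.rs (hypothetical path): run with `cargo bench` on nightly.
#![feature(test)]
extern crate test;

use test::Bencher;

#[bench]
fn reverse_u32(b: &mut Bencher) {
    // ~1 MiB of u32s, reversed from index 1 so the data is deliberately
    // misaligned relative to wide vector loads.
    let mut v: Vec<u32> = (0..262_145).collect();
    b.bytes = (v.len() as u64 - 1) * 4;
    b.iter(|| v[1..].reverse());
}
```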
Commit 891ca5f63c
library/core/src/slice/mod.rs:

```
@@ -623,100 +623,41 @@ impl<T> [T] {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn reverse(&mut self) {
-        let mut i: usize = 0;
-        let ln = self.len();
+        let half_len = self.len() / 2;
+        let Range { start, end } = self.as_mut_ptr_range();

-        // For very small types, all the individual reads in the normal
-        // path perform poorly. We can do better, given efficient unaligned
-        // load/store, by loading a larger chunk and reversing a register.
-
-        // Ideally LLVM would do this for us, as it knows better than we do
-        // whether unaligned reads are efficient (since that changes between
-        // different ARM versions, for example) and what the best chunk size
-        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
-        // the loop, so we need to do this ourselves. (Hypothesis: reverse
-        // is troublesome because the sides can be aligned differently --
-        // will be, when the length is odd -- so there's no way of emitting
-        // pre- and postludes to use fully-aligned SIMD in the middle.)
-
-        let fast_unaligned = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
-
-        if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Use the llvm.bswap intrinsic to reverse u8s in a usize
-            let chunk = mem::size_of::<usize>();
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: There are several things to check here:
-                //
-                // - Note that `chunk` is either 4 or 8 due to the cfg check
-                //   above. So `chunk - 1` is positive.
-                // - Indexing with index `i` is fine as the loop check guarantees
-                //   `i + chunk - 1 < ln / 2`
-                //   <=> `i < ln / 2 - (chunk - 1) < ln / 2 < ln`.
-                // - Indexing with index `ln - i - chunk = ln - (i + chunk)` is fine:
-                //   - `i + chunk > 0` is trivially true.
-                //   - The loop check guarantees:
-                //     `i + chunk - 1 < ln / 2`
-                //     <=> `i + chunk ≤ ln / 2 ≤ ln`, thus subtraction does not underflow.
-                // - The `read_unaligned` and `write_unaligned` calls are fine:
-                //   - `pa` points to index `i` where `i < ln / 2 - (chunk - 1)`
-                //     (see above) and `pb` points to index `ln - i - chunk`, so
-                //     both are at least `chunk` many bytes away from the end of `self`.
-                //   - Any initialized memory is valid `usize`.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut usize);
-                    let vb = ptr::read_unaligned(pb as *mut usize);
-                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
-                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
-                }
-                i += chunk;
-            }
-        }
-
-        if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Use rotate-by-16 to reverse u16s in a u32
-            let chunk = mem::size_of::<u32>() / 2;
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: An unaligned u32 can be read from `i` if `i + 1 < ln`
-                // (and obviously `i < ln`), because each element is 2 bytes and
-                // we're reading 4.
-                //
-                // `i + chunk - 1 < ln / 2` # while condition
-                // `i + 2 - 1 < ln / 2`
-                // `i + 1 < ln / 2`
-                //
-                // Since it's less than the length divided by 2, then it must be
-                // in bounds.
-                //
-                // This also means that the condition `0 < i + chunk <= ln` is
-                // always respected, ensuring the `pb` pointer can be used
-                // safely.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut u32);
-                    let vb = ptr::read_unaligned(pb as *mut u32);
-                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
-                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
-                }
-                i += chunk;
-            }
-        }
-
-        while i < ln / 2 {
-            // SAFETY: `i` is inferior to half the length of the slice so
-            // accessing `i` and `ln - i - 1` is safe (`i` starts at 0 and
-            // will not go further than `ln / 2 - 1`).
-            // The resulting pointers `pa` and `pb` are therefore valid and
-            // aligned, and can be read from and written to.
+        // These slices will skip the middle item for an odd length,
+        // since that one doesn't need to move.
+        let (front_half, back_half) =
+            // SAFETY: Both are subparts of the original slice, so the memory
+            // range is valid, and they don't overlap because they're each only
+            // half (or less) of the original slice.
             unsafe {
-                self.swap_unchecked(i, ln - i - 1);
+                (
+                    slice::from_raw_parts_mut(start, half_len),
+                    slice::from_raw_parts_mut(end.sub(half_len), half_len),
+                )
+            };
+
+        // Introducing a function boundary here means that the two halves
+        // get `noalias` markers, allowing better optimization as LLVM
+        // knows that they're disjoint, unlike in the original slice.
+        revswap(front_half, back_half, half_len);
+
+        #[inline]
+        fn revswap<T>(a: &mut [T], b: &mut [T], n: usize) {
+            debug_assert_eq!(a.len(), n);
+            debug_assert_eq!(b.len(), n);
+
+            // Because this function is first compiled in isolation,
+            // this check tells LLVM that the indexing below is
+            // in-bounds. Then after inlining -- once the actual
+            // lengths of the slices are known -- it's removed.
+            let (a, b) = (&mut a[..n], &mut b[..n]);
+
+            for i in 0..n {
+                mem::swap(&mut a[i], &mut b[n - 1 - i]);
             }
-            i += 1;
         }
     }
```
src/test/codegen/slice-reverse.rs (new file, 27 lines):

```
// compile-flags: -O
// only-x86_64
// ignore-debug: the debug assertions in from_raw_parts get in the way

#![crate_type = "lib"]

// CHECK-LABEL: @slice_reverse_u8
#[no_mangle]
pub fn slice_reverse_u8(slice: &mut [u8]) {
    // CHECK-NOT: panic_bounds_check
    // CHECK-NOT: slice_end_index_len_fail
    // CHECK: shufflevector <{{[0-9]+}} x i8>
    // CHECK-NOT: panic_bounds_check
    // CHECK-NOT: slice_end_index_len_fail
    slice.reverse();
}

// CHECK-LABEL: @slice_reverse_i32
#[no_mangle]
pub fn slice_reverse_i32(slice: &mut [i32]) {
    // CHECK-NOT: panic_bounds_check
    // CHECK-NOT: slice_end_index_len_fail
    // CHECK: shufflevector <{{[0-9]+}} x i32>
    // CHECK-NOT: panic_bounds_check
    // CHECK-NOT: slice_end_index_len_fail
    slice.reverse();
}
```