Auto merge of #118273 - AngelicosPhosphoros:dedup_2_loops_version_77772_2, r=the8472

Split `Vec::dedup_by` into 2 cycles

First cycle runs until we found 2 same elements, second runs after if there any found in the first one. This allows to avoid any memory writes until we found an item which we want to remove.

This leads to significant performance gains if all `Vec` items are kept: -40% on my benchmark with unique integers.

Results of benchmarks before implementation (including new benchmark where nothing needs to be removed):
 *   vec::bench_dedup_all_100                 74.00ns/iter  +/- 13.00ns
 *   vec::bench_dedup_all_1000               572.00ns/iter +/- 272.00ns
 *   vec::bench_dedup_all_100000              64.42µs/iter  +/- 19.47µs
 *   __vec::bench_dedup_none_100                67.00ns/iter  +/- 17.00ns__
 *   __vec::bench_dedup_none_1000              662.00ns/iter  +/- 86.00ns__
 *   __vec::bench_dedup_none_10000               9.16µs/iter   +/- 2.71µs__
 *   __vec::bench_dedup_none_100000             91.25µs/iter   +/- 1.82µs__
 *   vec::bench_dedup_random_100             105.00ns/iter  +/- 11.00ns
 *   vec::bench_dedup_random_1000            781.00ns/iter  +/- 10.00ns
 *   vec::bench_dedup_random_10000             9.00µs/iter   +/- 5.62µs
 *   vec::bench_dedup_random_100000          449.81µs/iter  +/- 74.99µs
 *   vec::bench_dedup_slice_truncate_100     105.00ns/iter  +/- 16.00ns
 *   vec::bench_dedup_slice_truncate_1000      2.65µs/iter +/- 481.00ns
 *   vec::bench_dedup_slice_truncate_10000    18.33µs/iter   +/- 5.23µs
 *   vec::bench_dedup_slice_truncate_100000  501.12µs/iter  +/- 46.97µs

Results after implementation:
 *   vec::bench_dedup_all_100                 75.00ns/iter   +/- 9.00ns
 *   vec::bench_dedup_all_1000               494.00ns/iter +/- 117.00ns
 *   vec::bench_dedup_all_100000              58.13µs/iter   +/- 8.78µs
 *   __vec::bench_dedup_none_100                52.00ns/iter  +/- 22.00ns__
 *   __vec::bench_dedup_none_1000              417.00ns/iter +/- 116.00ns__
 *   __vec::bench_dedup_none_10000               4.11µs/iter +/- 546.00ns__
 *   __vec::bench_dedup_none_100000             40.47µs/iter   +/- 5.36µs__
 *   vec::bench_dedup_random_100              77.00ns/iter  +/- 15.00ns
 *   vec::bench_dedup_random_1000            681.00ns/iter  +/- 86.00ns
 *   vec::bench_dedup_random_10000            11.66µs/iter   +/- 2.22µs
 *   vec::bench_dedup_random_100000          469.35µs/iter  +/- 20.53µs
 *   vec::bench_dedup_slice_truncate_100     100.00ns/iter   +/- 5.00ns
 *   vec::bench_dedup_slice_truncate_1000      2.55µs/iter +/- 224.00ns
 *   vec::bench_dedup_slice_truncate_10000    18.95µs/iter   +/- 2.59µs
 *   vec::bench_dedup_slice_truncate_100000  492.85µs/iter  +/- 72.84µs

Resolves #77772

P.S. Note that this is same PR as #92104 I just missed review then forgot about it.
Also, I cannot reopen that pull request so I am creating a new one.
I responded to remaining questions directly by adding commentaries to my code.
This commit is contained in:
bors 2023-12-05 21:40:02 +00:00
commit e9013ac0e4
2 changed files with 155 additions and 34 deletions

View File

@ -658,13 +658,17 @@ fn random_sorted_fill(mut seed: u32, buf: &mut [u32]) {
buf.sort(); buf.sort();
} }
fn bench_vec_dedup_old(b: &mut Bencher, sz: usize) { // Measures performance of slice dedup impl.
// This was used to justify separate implementation of dedup for Vec.
// This algorithm was used for Vecs prior to Rust 1.52.
fn bench_dedup_slice_truncate(b: &mut Bencher, sz: usize) {
let mut template = vec![0u32; sz]; let mut template = vec![0u32; sz];
b.bytes = std::mem::size_of_val(template.as_slice()) as u64; b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
random_sorted_fill(0x43, &mut template); random_sorted_fill(0x43, &mut template);
let mut vec = template.clone(); let mut vec = template.clone();
b.iter(|| { b.iter(|| {
let vec = black_box(&mut vec);
let len = { let len = {
let (dedup, _) = vec.partition_dedup(); let (dedup, _) = vec.partition_dedup();
dedup.len() dedup.len()
@ -672,59 +676,143 @@ fn bench_vec_dedup_old(b: &mut Bencher, sz: usize) {
vec.truncate(len); vec.truncate(len);
black_box(vec.first()); black_box(vec.first());
let vec = black_box(vec);
vec.clear(); vec.clear();
vec.extend_from_slice(&template); vec.extend_from_slice(&template);
}); });
} }
fn bench_vec_dedup_new(b: &mut Bencher, sz: usize) { // Measures performance of Vec::dedup on random data.
fn bench_vec_dedup_random(b: &mut Bencher, sz: usize) {
let mut template = vec![0u32; sz]; let mut template = vec![0u32; sz];
b.bytes = std::mem::size_of_val(template.as_slice()) as u64; b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
random_sorted_fill(0x43, &mut template); random_sorted_fill(0x43, &mut template);
let mut vec = template.clone(); let mut vec = template.clone();
b.iter(|| { b.iter(|| {
let vec = black_box(&mut vec);
vec.dedup(); vec.dedup();
black_box(vec.first()); black_box(vec.first());
let vec = black_box(vec);
vec.clear();
vec.extend_from_slice(&template);
});
}
// Measures performance of Vec::dedup when there is no items removed
fn bench_vec_dedup_none(b: &mut Bencher, sz: usize) {
let mut template = vec![0u32; sz];
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
template.chunks_exact_mut(2).for_each(|w| {
w[0] = black_box(0);
w[1] = black_box(5);
});
let mut vec = template.clone();
b.iter(|| {
let vec = black_box(&mut vec);
vec.dedup();
black_box(vec.first());
// Unlike other benches of `dedup`
// this doesn't reinitialize vec
// because we measure how efficient dedup is
// when no memory written
});
}
// Measures performance of Vec::dedup when there is all items removed
fn bench_vec_dedup_all(b: &mut Bencher, sz: usize) {
let mut template = vec![0u32; sz];
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
template.iter_mut().for_each(|w| {
*w = black_box(0);
});
let mut vec = template.clone();
b.iter(|| {
let vec = black_box(&mut vec);
vec.dedup();
black_box(vec.first());
let vec = black_box(vec);
vec.clear(); vec.clear();
vec.extend_from_slice(&template); vec.extend_from_slice(&template);
}); });
} }
#[bench] #[bench]
fn bench_dedup_old_100(b: &mut Bencher) { fn bench_dedup_slice_truncate_100(b: &mut Bencher) {
bench_vec_dedup_old(b, 100); bench_dedup_slice_truncate(b, 100);
} }
#[bench] #[bench]
fn bench_dedup_new_100(b: &mut Bencher) { fn bench_dedup_random_100(b: &mut Bencher) {
bench_vec_dedup_new(b, 100); bench_vec_dedup_random(b, 100);
} }
#[bench] #[bench]
fn bench_dedup_old_1000(b: &mut Bencher) { fn bench_dedup_none_100(b: &mut Bencher) {
bench_vec_dedup_old(b, 1000); bench_vec_dedup_none(b, 100);
}
#[bench]
fn bench_dedup_new_1000(b: &mut Bencher) {
bench_vec_dedup_new(b, 1000);
} }
#[bench] #[bench]
fn bench_dedup_old_10000(b: &mut Bencher) { fn bench_dedup_all_100(b: &mut Bencher) {
bench_vec_dedup_old(b, 10000); bench_vec_dedup_all(b, 100);
}
#[bench]
fn bench_dedup_new_10000(b: &mut Bencher) {
bench_vec_dedup_new(b, 10000);
} }
#[bench] #[bench]
fn bench_dedup_old_100000(b: &mut Bencher) { fn bench_dedup_slice_truncate_1000(b: &mut Bencher) {
bench_vec_dedup_old(b, 100000); bench_dedup_slice_truncate(b, 1000);
} }
#[bench] #[bench]
fn bench_dedup_new_100000(b: &mut Bencher) { fn bench_dedup_random_1000(b: &mut Bencher) {
bench_vec_dedup_new(b, 100000); bench_vec_dedup_random(b, 1000);
}
#[bench]
fn bench_dedup_none_1000(b: &mut Bencher) {
bench_vec_dedup_none(b, 1000);
}
#[bench]
fn bench_dedup_all_1000(b: &mut Bencher) {
bench_vec_dedup_all(b, 1000);
}
#[bench]
fn bench_dedup_slice_truncate_10000(b: &mut Bencher) {
bench_dedup_slice_truncate(b, 10000);
}
#[bench]
fn bench_dedup_random_10000(b: &mut Bencher) {
bench_vec_dedup_random(b, 10000);
}
#[bench]
fn bench_dedup_none_10000(b: &mut Bencher) {
bench_vec_dedup_none(b, 10000);
}
#[bench]
fn bench_dedup_all_10000(b: &mut Bencher) {
bench_vec_dedup_all(b, 10000);
}
#[bench]
fn bench_dedup_slice_truncate_100000(b: &mut Bencher) {
bench_dedup_slice_truncate(b, 100000);
}
#[bench]
fn bench_dedup_random_100000(b: &mut Bencher) {
bench_vec_dedup_random(b, 100000);
}
#[bench]
fn bench_dedup_none_100000(b: &mut Bencher) {
bench_vec_dedup_none(b, 100000);
}
#[bench]
fn bench_dedup_all_100000(b: &mut Bencher) {
bench_vec_dedup_all(b, 100000);
} }
#[bench] #[bench]

View File

@ -1777,7 +1777,32 @@ impl<T, A: Allocator> Vec<T, A> {
return; return;
} }
/* INVARIANT: vec.len() > read >= write > write-1 >= 0 */ // Check if we ever want to remove anything.
// This allows to use copy_non_overlapping in next cycle.
// And avoids any memory writes if we don't need to remove anything.
let mut first_duplicate_idx: usize = 1;
let start = self.as_mut_ptr();
while first_duplicate_idx != len {
let found_duplicate = unsafe {
// SAFETY: first_duplicate always in range [1..len)
// Note that we start iteration from 1 so we never overflow.
let prev = start.add(first_duplicate_idx.wrapping_sub(1));
let current = start.add(first_duplicate_idx);
// We explicitly say in docs that references are reversed.
same_bucket(&mut *current, &mut *prev)
};
if found_duplicate {
break;
}
first_duplicate_idx += 1;
}
// Don't need to remove anything.
// We cannot get bigger than len.
if first_duplicate_idx == len {
return;
}
/* INVARIANT: vec.len() > read > write > write-1 >= 0 */
struct FillGapOnDrop<'a, T, A: core::alloc::Allocator> { struct FillGapOnDrop<'a, T, A: core::alloc::Allocator> {
/* Offset of the element we want to check if it is duplicate */ /* Offset of the element we want to check if it is duplicate */
read: usize, read: usize,
@ -1823,31 +1848,39 @@ impl<T, A: Allocator> Vec<T, A> {
} }
} }
let mut gap = FillGapOnDrop { read: 1, write: 1, vec: self };
let ptr = gap.vec.as_mut_ptr();
/* Drop items while going through Vec, it should be more efficient than /* Drop items while going through Vec, it should be more efficient than
* doing slice partition_dedup + truncate */ * doing slice partition_dedup + truncate */
// Construct gap first and then drop item to avoid memory corruption if `T::drop` panics.
let mut gap =
FillGapOnDrop { read: first_duplicate_idx + 1, write: first_duplicate_idx, vec: self };
unsafe {
// SAFETY: we checked that first_duplicate_idx in bounds before.
// If drop panics, `gap` would remove this item without drop.
ptr::drop_in_place(start.add(first_duplicate_idx));
}
/* SAFETY: Because of the invariant, read_ptr, prev_ptr and write_ptr /* SAFETY: Because of the invariant, read_ptr, prev_ptr and write_ptr
* are always in-bounds and read_ptr never aliases prev_ptr */ * are always in-bounds and read_ptr never aliases prev_ptr */
unsafe { unsafe {
while gap.read < len { while gap.read < len {
let read_ptr = ptr.add(gap.read); let read_ptr = start.add(gap.read);
let prev_ptr = ptr.add(gap.write.wrapping_sub(1)); let prev_ptr = start.add(gap.write.wrapping_sub(1));
if same_bucket(&mut *read_ptr, &mut *prev_ptr) { // We explicitly say in docs that references are reversed.
let found_duplicate = same_bucket(&mut *read_ptr, &mut *prev_ptr);
if found_duplicate {
// Increase `gap.read` now since the drop may panic. // Increase `gap.read` now since the drop may panic.
gap.read += 1; gap.read += 1;
/* We have found duplicate, drop it in-place */ /* We have found duplicate, drop it in-place */
ptr::drop_in_place(read_ptr); ptr::drop_in_place(read_ptr);
} else { } else {
let write_ptr = ptr.add(gap.write); let write_ptr = start.add(gap.write);
/* Because `read_ptr` can be equal to `write_ptr`, we either /* read_ptr cannot be equal to write_ptr because at this point
* have to use `copy` or conditional `copy_nonoverlapping`. * we guaranteed to skip at least one element (before loop starts).
* Looks like the first option is faster. */ */
ptr::copy(read_ptr, write_ptr, 1); ptr::copy_nonoverlapping(read_ptr, write_ptr, 1);
/* We have filled that place, so go further */ /* We have filled that place, so go further */
gap.write += 1; gap.write += 1;