Auto merge of #129587 - Voultapher:opt-for-size-variants-of-sort-impls, r=cuviper
Add `optimize_for_size` variants for stable and unstable sort as well as select_nth_unstable.

- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function.
- Unstable sort jumps directly to the branchless heapsort fallback.
- select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl.

Some code is duplicated, but de-duplication would bring its own problems. For example, `swap_if_less` is critical for performance: if the sorting networks don't inline it, perf drops drastically. However `#[inline(always)]` is also a poor fit; if the provided comparison function is huge, it is better to give the compiler an out to instantiate `swap_if_less` only once and call it. Another aspect that would suffer when making `swap_if_less` pub is having to cfg out dozens of functions in the smallsort module.

Part of https://github.com/rust-lang/rust/issues/125612

r? `@Kobzol`
commit 363ae41883
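Throughout the description and the diff below, "branchless" means that the result of a comparison feeds arithmetic or a conditional move instead of a data-dependent jump. A minimal, self-contained illustration in plain Rust (not library code):

// Order two integers without a data-dependent branch: `min`/`max` on primitives
// typically lower to conditional-move instructions rather than jumps.
fn sort2(a: u64, b: u64) -> (u64, u64) {
    (a.min(b), a.max(b))
}

The smallsort networks, the heapsort fallback, and the Lomuto partitions touched below are all written in this style; it is fast but costs extra instructions and i-cache, which is exactly the trade-off the new `optimize_for_size` variants revisit.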
@@ -7,6 +7,7 @@
 //! better performance than one would get using heapsort as fallback.

 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 use crate::slice::sort::unstable::quicksort::partition;
@@ -40,7 +41,13 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                median_of_medians(v, &mut is_less, index);
+            } else {
+                partition_at_index_loop(v, index, None, &mut is_less);
+            }
+        }
     }

     let (left, right) = v.split_at_mut(index);
@@ -53,6 +60,7 @@ where
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;

+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
             if v.len() >= 2 {
                 insertion_sort_shift_left(v, 1, is_less);
             }
+
             return;
         }

@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]
+
 use crate::marker::Freeze;

 pub(crate) mod pivot;
@@ -378,7 +378,12 @@ where

 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+///
+/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
+/// types. `is_less` could be a huge function and we want to give the compiler an option to
+/// not inline this function. For the same reasons that this function is very perf critical,
+/// it should be in the same module as the functions that use it.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
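To make the inlining concern from the PR description concrete, here is a self-contained sketch of how a sorting network is composed from `swap_if_less`. It is illustrative only: the in-tree helper is branchless and lives next to its callers in the smallsort module, and the 3-element network below is just the simplest example. If each of these calls were an actual function call, the network would lose its advantage over plain insertion sort.

use std::ptr;

// Simplified compare-and-swap helper (the real one avoids the branch).
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller guarantees `a_pos` and `b_pos` are in-bounds for `v_base`.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);
        if is_less(&*v_b, &*v_a) {
            ptr::swap_nonoverlapping(v_a, v_b, 1);
        }
    }
}

// Fixed 3-element network: the compare-and-swap sequence (0,1), (1,2), (0,1)
// sorts any permutation of three values.
fn sort3<T, F: FnMut(&T, &T) -> bool>(v: &mut [T; 3], is_less: &mut F) {
    let v_base = v.as_mut_ptr();
    // SAFETY: indices 0..=2 are in-bounds for a length-3 array.
    unsafe {
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 0, 1, is_less);
    }
}

This is also why `#[inline(always)]` is not the answer: with a huge user-supplied `is_less`, it can be better to instantiate `swap_if_less` once out of line and call it, so the decision is left to the inliner.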
@@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.

+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
     SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
 };
-use crate::{cmp, intrinsics};

-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;

+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,53 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
         return;
     }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            let alloc_len = len / 2;

-    driftsort_main::<T, F, BufT>(v, is_less);
+            cfg_if! {
+                if #[cfg(target_pointer_width = "16")] {
+                    let heap_buf = BufT::with_capacity(alloc_len);
+                    let scratch = heap_buf.as_uninit_slice_mut();
+                } else {
+                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
+                    let stack_scratch = stack_buf.as_uninit_slice_mut();
+                    let mut heap_buf;
+                    let scratch = if stack_scratch.len() >= alloc_len {
+                        stack_scratch
+                    } else {
+                        heap_buf = BufT::with_capacity(alloc_len);
+                        heap_buf.as_uninit_slice_mut()
+                    };
+                }
+            }
+
+            tiny::mergesort(v, scratch, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }
+
+            driftsort_main::<T, F, BufT>(v, is_less);
+        }
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
     // By allocating n elements of memory we can ensure the entire input can
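The stack-or-heap scratch selection above relies on the library's `AlignedStorage` and `BufT` machinery. A self-contained sketch of the same pattern in ordinary Rust (the names and the 64-element budget here are illustrative, not the library's):

use std::mem::MaybeUninit;

// Run `f` with a scratch buffer of `alloc_len` elements, preferring a fixed stack
// buffer and falling back to a heap allocation only when it is too small.
fn with_scratch<T, R>(alloc_len: usize, f: impl FnOnce(&mut [MaybeUninit<T>]) -> R) -> R {
    let mut stack_buf = [const { MaybeUninit::<T>::uninit() }; 64];
    let mut heap_buf: Vec<MaybeUninit<T>> = Vec::new();

    let scratch: &mut [MaybeUninit<T>] = if stack_buf.len() >= alloc_len {
        &mut stack_buf
    } else {
        heap_buf.resize_with(alloc_len, MaybeUninit::uninit);
        &mut heap_buf
    };

    f(scratch)
}

The benefit is the same as in the diff: for small inputs the (de-)allocator is never touched, and since this mergesort only needs `len / 2` elements of scratch, the fixed buffer covers most calls.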
library/core/src/slice/sort/stable/tiny.rs (new file, +41 lines)
@@ -0,0 +1,41 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::MaybeUninit;
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            let v_base = v.as_mut_ptr();
+            let v_a = v_base;
+            let v_b = v_base.add(1);
+
+            if is_less(&*v_b, &*v_a) {
+                ptr::swap_nonoverlapping(v_a, v_b, 1);
+            }
+        }
+    }
+}
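Why `len / 2` scratch is enough for this shape of mergesort: with a balanced top-down split, the merge only has to park one of the two runs (the left half, at most `len / 2` elements) outside the slice before merging back. A safe, self-contained sketch of that idea (illustrative; the real `merge::merge` works on `MaybeUninit` scratch and is panic-safe):

// Top-down mergesort whose auxiliary storage is only the left half of each merge step.
fn mergesort_sketch<T: Clone, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let len = v.len();
    if len > 2 {
        let mid = len / 2;
        mergesort_sketch(&mut v[..mid], is_less);
        mergesort_sketch(&mut v[mid..], is_less);

        // Copy out only the left run: at most `len / 2` elements of scratch.
        let left = v[..mid].to_vec();
        let mut i = 0; // next element of the copied left run
        let mut j = mid; // next element of the right run, still in `v`
        let mut out = 0; // next write position in `v`
        while i < left.len() && j < len {
            if is_less(&v[j], &left[i]) {
                v[out] = v[j].clone();
                j += 1;
            } else {
                // Taking the left element on ties keeps the sort stable.
                v[out] = left[i].clone();
                i += 1;
            }
            out += 1;
        }
        while i < left.len() {
            v[out] = left[i].clone();
            i += 1;
            out += 1;
        }
        // Any remaining right-run elements are already in their final positions.
    } else if len == 2 && is_less(&v[1], &v[0]) {
        v.swap(0, 1);
    }
}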
@@ -1,46 +1,46 @@
 //! This module contains a branchless heapsort as fallback for unstable quicksort.

-use crate::{intrinsics, ptr};
+use crate::{cmp, intrinsics, ptr};

 /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
 ///
 /// Never inline this; it sits in the main hot-loop in `recurse` and is meant as an unlikely algorithmic
 /// fallback.
-///
-/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
 #[inline(never)]
-pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
+pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // SAFETY: See function safety.
-    unsafe {
-        intrinsics::assume(v.len() >= 2);
+    let len = v.len();

-        // Build the heap in linear time.
-        for i in (0..v.len() / 2).rev() {
-            sift_down(v, i, is_less);
-        }
-
-        // Pop maximal elements from the heap.
-        for i in (1..v.len()).rev() {
+    for i in (0..len + len / 2).rev() {
+        let sift_idx = if i >= len {
+            i - len
+        } else {
             v.swap(0, i);
-            sift_down(&mut v[..i], 0, is_less);
+            0
+        };
+
+        // SAFETY: The above calculation ensures that `sift_idx` is either 0 or
+        // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
+        // This guarantees the required `sift_idx <= len`.
+        unsafe {
+            sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
         }
     }
 }

 // This binary heap respects the invariant `parent >= child`.
 //
-// SAFETY: The caller has to guarantee that node < `v.len()`.
-#[inline(never)]
+// SAFETY: The caller has to guarantee that `node <= v.len()`.
+#[inline(always)]
 unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
     // SAFETY: See function safety.
     unsafe {
-        intrinsics::assume(node < v.len());
+        intrinsics::assume(node <= v.len());
     }

     let len = v.len();
@@ -69,9 +69,7 @@ where
                break;
            }

-            // Swap `node` with the greater child, move one step down, and continue sifting. This
-            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
-            ptr::swap(v_base.add(node), v_base.add(child));
+            ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
        }

        node = child;
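For readers new to the merged loop in `heapsort` above: it folds the two classic phases of heapsort into a single `for` loop so that `sift_down` ends up with exactly one call site, which is what saves binary size. Iterations with `i >= len` are the heapify phase (`sift_idx = i - len` runs from `len / 2 - 1` down to `0`), and iterations with `i < len` are the pop phase (swap the maximum to position `i`, then sift the new root down within `v[..i]`). A safe, index-based sketch of the equivalent two-phase formulation (illustrative, not the library code):

// Classic two-phase heapsort: heapify, then repeatedly pop the maximum.
fn heapsort_two_phase<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let len = v.len();

    // Phase 1: build a max-heap in-place, bottom-up.
    for node in (0..len / 2).rev() {
        sift_down(v, node, is_less);
    }

    // Phase 2: move the current maximum to the end of the shrinking unsorted prefix.
    for end in (1..len).rev() {
        v.swap(0, end);
        sift_down(&mut v[..end], 0, is_less);
    }
}

// Restore the max-heap property for the subtree rooted at `node`.
fn sift_down<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], mut node: usize, is_less: &mut F) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater of the two children, if a right child exists.
        if child + 1 < v.len() && is_less(&v[child], &v[child + 1]) {
            child += 1;
        }
        // Stop once the parent is not less than the greater child.
        if !is_less(&v[node], &v[child]) {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}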
@@ -2,7 +2,9 @@

 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

 pub(crate) mod heapsort;
@@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
         return;
     }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            heapsort::heapsort(v, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }

-    ipnsort(v, is_less);
+            ipnsort(v, is_less);
+        }
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
@@ -1,8 +1,12 @@
 //! This module contains an unstable quicksort and two partition implementations.

 use crate::mem::{self, ManuallyDrop};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;
+#[cfg(not(feature = "optimize_for_size"))]
+use crate::slice::sort::unstable::heapsort;
 use crate::{intrinsics, ptr};

 /// Sorts `v` recursively.
@@ -11,6 +15,7 @@ use crate::{intrinsics, ptr};
 ///
 /// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
 /// this function will immediately switch to heapsort.
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) fn quicksort<'a, T, F>(
     mut v: &'a mut [T],
     mut ancestor_pivot: Option<&'a T>,
@@ -28,10 +33,7 @@ pub(crate) fn quicksort<'a, T, F>(
         // If too many bad pivot choices were made, simply fall back to heapsort in order to
         // guarantee `O(N x log(N))` worst-case.
         if limit == 0 {
-            // SAFETY: We assume the `small_sort` threshold is at least 1.
-            unsafe {
-                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
-            }
+            heapsort::heapsort(v, is_less);
             return;
         }

@@ -98,13 +100,15 @@ where
         return 0;
     }

-    // Allows for panic-free code-gen by proving this property to the compiler.
     if pivot >= len {
         intrinsics::abort();
     }

-    // Place the pivot at the beginning of slice.
-    v.swap(0, pivot);
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
     let (pivot, v_without_pivot) = v.split_at_mut(1);

     // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
@@ -118,8 +122,15 @@ where
     // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
     let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

-    // Place the pivot between the two partitions.
-    v.swap(0, num_lt);
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }

     num_lt
 }
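Both swaps above follow the same pattern: an unlikely `abort()` guard proves the index is in range, after which the unchecked swap is sound and the function contains no panic or unwind path. The in-tree code uses `intrinsics::abort()` and the unstable `slice::swap_unchecked`; a stable-Rust approximation of the same idea (illustrative only):

// Move the chosen pivot to the front of the slice without emitting panic machinery.
fn place_pivot<T>(v: &mut [T], pivot: usize) {
    // Unlikely guard: aborting (rather than panicking) keeps the function free of
    // unwind paths while still proving `pivot < v.len()` from here on.
    if pivot >= v.len() {
        std::process::abort();
    }

    // SAFETY: `pivot` is in-bounds (checked above), and element 0 exists because the
    // check also implies `v.len() >= 1`. `ptr::swap` tolerates identical pointers,
    // which happens when `pivot == 0`.
    unsafe {
        let base = v.as_mut_ptr();
        std::ptr::swap(base, base.add(pivot));
    }
}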
@@ -129,7 +140,13 @@ const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut
     if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
         // Specialize for types that are relatively cheap to copy, where branchless optimizations
         // have large leverage e.g. `u64` and `String`.
-        partition_lomuto_branchless_cyclic::<T, F>
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                partition_lomuto_branchless_simple::<T, F>
+            } else {
+                partition_lomuto_branchless_cyclic::<T, F>
+            }
+        }
     } else {
         partition_hoare_branchy_cyclic::<T, F>
     }
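The `(const { inst_partition::<T, F>() })( ... )` call a few hunks above, combined with this `const fn`, picks a partition implementation during const evaluation, so only the selected function is instantiated and linked for a given `T`. A self-contained sketch of the trick (the type bound, threshold, and placeholder bodies here are mine, not the library's):

use std::mem;

fn small_type_impl<T: Ord>(v: &mut [T]) {
    v.sort_unstable(); // placeholder body
}

fn large_type_impl<T: Ord>(v: &mut [T]) {
    v.sort_unstable(); // placeholder body
}

// Const-evaluated selector returning a plain function pointer.
const fn inst_sort<T: Ord>() -> fn(&mut [T]) {
    if mem::size_of::<T>() <= 8 { small_type_impl::<T> } else { large_type_impl::<T> }
}

fn sort_dispatch<T: Ord>(v: &mut [T]) {
    // Evaluated at compile time; the unselected implementation is never referenced
    // for this `T`, so it is not monomorphized into the binary.
    (const { inst_sort::<T>() })(v);
}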
@@ -215,6 +232,7 @@ where
     }
 }

+#[cfg(not(feature = "optimize_for_size"))]
 struct PartitionState<T> {
     // The current element that is being looked at, scans left to right through slice.
     right: *mut T,
@@ -225,6 +243,7 @@ struct PartitionState<T> {
     gap: GapGuardRaw<T>,
 }

+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
 where
     F: FnMut(&T, &T) -> bool,
@@ -316,6 +335,27 @@ where
     }
 }

+#[cfg(feature = "optimize_for_size")]
+fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    pivot: &T,
+    is_less: &mut F,
+) -> usize {
+    let mut left = 0;
+
+    for right in 0..v.len() {
+        // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that
+        // left <= right and that both are in-bounds.
+        unsafe {
+            let right_is_lt = is_less(v.get_unchecked(right), pivot);
+            v.swap_unchecked(left, right);
+            left += right_is_lt as usize;
+        }
+    }
+
+    left
+}
+
 struct GapGuard<T> {
     pos: *mut T,
     value: ManuallyDrop<T>,
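A safe, runnable illustration of what `partition_lomuto_branchless_simple` is doing: the comparison is evaluated before the swap, the swap happens unconditionally, and only the increment of `left` depends on the comparison, so the loop body contains no data-dependent branch. (Illustrative only; the in-tree version uses unchecked indexing to avoid bounds checks.)

fn partition_simple(v: &mut [i32], pivot: i32) -> usize {
    let mut left = 0;
    for right in 0..v.len() {
        // Compare first: after the swap, `v[right]` may hold a different element.
        let right_is_lt = v[right] < pivot;
        v.swap(left, right);
        left += right_is_lt as usize;
    }
    left
}

fn main() {
    let mut data = [5, 1, 7, 3, 9, 2];
    let lt = partition_simple(&mut data, 5);
    assert!(data[..lt].iter().all(|&x| x < 5));
    assert!(data[lt..].iter().all(|&x| x >= 5));
    println!("{lt} elements are < 5: {data:?}");
}

When `right_is_lt` is false, the swap is either a no-op (`left == right`) or exchanges two elements that are both `>= pivot`, so the partition invariant still holds.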
@@ -333,11 +373,13 @@ impl<T> Drop for GapGuard<T> {

 /// Ideally this wouldn't be needed and we could just use the regular GapGuard.
 /// See comment in [`partition_lomuto_branchless_cyclic`].
+#[cfg(not(feature = "optimize_for_size"))]
 struct GapGuardRaw<T> {
     pos: *mut T,
     value: *mut T,
 }

+#[cfg(not(feature = "optimize_for_size"))]
 impl<T> Drop for GapGuardRaw<T> {
     fn drop(&mut self) {
         // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into