mirror of
https://github.com/rust-lang/rust.git
synced 2025-02-20 10:55:14 +00:00
Replace sort implementations
- `slice::sort` -> driftsort https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md - `slice::sort_unstable` -> ipnsort https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md Replaces the sort implementations with tailor made ones that strike a balance of run-time, compile-time and binary-size, yielding run-time and compile-time improvements. Regressing binary-size for `slice::sort` while improving it for `slice::sort_unstable`. All while upholding the existing soft and hard safety guarantees, and even extending the soft guarantees, detecting strict weak ordering violations with a high chance and reporting it to users via a panic. In addition the implementation of `select_nth_unstable` is also adapted as it uses `slice::sort_unstable` internals.
This commit is contained in:
parent
4a78c00e22
commit
e49be415cd
@ -16,7 +16,7 @@ use core::borrow::{Borrow, BorrowMut};
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use core::cmp::Ordering::{self, Less};
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use core::mem::{self, SizedTypeProperties};
|
||||
use core::mem::{self, MaybeUninit};
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use core::ptr;
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
@ -24,7 +24,7 @@ use core::slice::sort;
|
||||
|
||||
use crate::alloc::Allocator;
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use crate::alloc::{self, Global};
|
||||
use crate::alloc::Global;
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use crate::borrow::ToOwned;
|
||||
use crate::boxed::Box;
|
||||
@ -174,23 +174,32 @@ pub(crate) mod hack {
|
||||
|
||||
#[cfg(not(test))]
|
||||
impl<T> [T] {
|
||||
/// Sorts the slice.
|
||||
/// Sorts the slice, preserving initial order of equal elements.
|
||||
///
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
|
||||
/// worst-case.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
|
||||
/// original elements will remain in the slice and any possible modifications via interior
|
||||
/// mutability are observed in the input. Same is true if `T: Ord` panics.
|
||||
///
|
||||
/// When applicable, unstable sorting is preferred because it is generally faster than stable
|
||||
/// sorting and it doesn't allocate auxiliary memory.
|
||||
/// See [`sort_unstable`](slice::sort_unstable).
|
||||
/// sorting and it doesn't allocate auxiliary memory. See
|
||||
/// [`sort_unstable`](slice::sort_unstable). The exception are partially sorted slices, which
|
||||
/// may be better served with `slice::sort`.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an adaptive, iterative merge sort inspired by
|
||||
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
|
||||
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
|
||||
/// two or more sorted sequences concatenated one after another.
|
||||
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
|
||||
/// combines the fast average case of quicksort with the fast worst case and partial run
|
||||
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
|
||||
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
|
||||
/// non-allocating insertion sort is used instead.
|
||||
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
|
||||
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
|
||||
/// clamps at `self.len() / 2`.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -200,6 +209,8 @@ impl<T> [T] {
|
||||
/// v.sort();
|
||||
/// assert!(v == [-5, -3, 1, 2, 4]);
|
||||
/// ```
|
||||
///
|
||||
/// [driftsort]: https://github.com/Voultapher/driftsort
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
#[rustc_allow_incoherent_impl]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
@ -211,13 +222,18 @@ impl<T> [T] {
|
||||
stable_sort(self, T::lt);
|
||||
}
|
||||
|
||||
/// Sorts the slice with a comparator function.
|
||||
/// Sorts the slice with a comparator function, preserving initial order of equal elements.
|
||||
///
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
|
||||
/// worst-case.
|
||||
///
|
||||
/// The comparator function must define a total ordering for the elements in the slice. If
|
||||
/// the ordering is not total, the order of the elements is unspecified. An order is a
|
||||
/// total order if it is (for all `a`, `b` and `c`):
|
||||
/// The comparator function should define a total ordering for the elements in the slice. If the
|
||||
/// ordering is not total, the order of the elements is unspecified.
|
||||
///
|
||||
/// If the comparator function does not implement a total order the resulting order is
|
||||
/// unspecified. All original elements will remain in the slice and any possible modifications
|
||||
/// via interior mutability are observed in the input. Same is true if the comparator function
|
||||
/// panics. A total order (for all `a`, `b` and `c`):
|
||||
///
|
||||
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
|
||||
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
|
||||
@ -227,23 +243,22 @@ impl<T> [T] {
|
||||
///
|
||||
/// ```
|
||||
/// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
|
||||
/// floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
/// floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
/// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
|
||||
/// ```
|
||||
///
|
||||
/// When applicable, unstable sorting is preferred because it is generally faster than stable
|
||||
/// sorting and it doesn't allocate auxiliary memory.
|
||||
/// See [`sort_unstable_by`](slice::sort_unstable_by).
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an adaptive, iterative merge sort inspired by
|
||||
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
|
||||
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
|
||||
/// two or more sorted sequences concatenated one after another.
|
||||
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
|
||||
/// combines the fast average case of quicksort with the fast worst case and partial run
|
||||
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
|
||||
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
|
||||
/// non-allocating insertion sort is used instead.
|
||||
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
|
||||
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
|
||||
/// clamps at `self.len() / 2`.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -256,6 +271,8 @@ impl<T> [T] {
|
||||
/// v.sort_by(|a, b| b.cmp(a));
|
||||
/// assert!(v == [5, 4, 3, 2, 1]);
|
||||
/// ```
|
||||
///
|
||||
/// [driftsort]: https://github.com/Voultapher/driftsort
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
#[rustc_allow_incoherent_impl]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
@ -267,28 +284,27 @@ impl<T> [T] {
|
||||
stable_sort(self, |a, b| compare(a, b) == Less);
|
||||
}
|
||||
|
||||
/// Sorts the slice with a key extraction function.
|
||||
/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
|
||||
///
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
|
||||
/// worst-case, where the key function is *O*(*m*).
|
||||
///
|
||||
/// For expensive key functions (e.g. functions that are not simple property accesses or
|
||||
/// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be
|
||||
/// significantly faster, as it does not recompute element keys.
|
||||
///
|
||||
/// When applicable, unstable sorting is preferred because it is generally faster than stable
|
||||
/// sorting and it doesn't allocate auxiliary memory.
|
||||
/// See [`sort_unstable_by_key`](slice::sort_unstable_by_key).
|
||||
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
|
||||
/// All original elements will remain in the slice and any possible modifications via interior
|
||||
/// mutability are observed in the input. Same is true if `K: Ord` panics.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an adaptive, iterative merge sort inspired by
|
||||
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
|
||||
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
|
||||
/// two or more sorted sequences concatenated one after another.
|
||||
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
|
||||
/// combines the fast average case of quicksort with the fast worst case and partial run
|
||||
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
|
||||
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
|
||||
/// non-allocating insertion sort is used instead.
|
||||
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
|
||||
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
|
||||
/// clamps at `self.len() / 2`.
|
||||
///
|
||||
/// If `K: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -298,6 +314,8 @@ impl<T> [T] {
|
||||
/// v.sort_by_key(|k| k.abs());
|
||||
/// assert!(v == [1, 2, -3, 4, -5]);
|
||||
/// ```
|
||||
///
|
||||
/// [driftsort]: https://github.com/Voultapher/driftsort
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
#[rustc_allow_incoherent_impl]
|
||||
#[stable(feature = "slice_sort_by_key", since = "1.7.0")]
|
||||
@ -310,27 +328,30 @@ impl<T> [T] {
|
||||
stable_sort(self, |a, b| f(a).lt(&f(b)));
|
||||
}
|
||||
|
||||
/// Sorts the slice with a key extraction function.
|
||||
/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
|
||||
///
|
||||
/// During sorting, the key function is called at most once per element, by using
|
||||
/// temporary storage to remember the results of key evaluation.
|
||||
/// The order of calls to the key function is unspecified and may change in future versions
|
||||
/// of the standard library.
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \*
|
||||
/// log(*n*)) worst-case, where the key function is *O*(*m*).
|
||||
///
|
||||
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
|
||||
/// worst-case, where the key function is *O*(*m*).
|
||||
/// During sorting, the key function is called at most once per element, by using temporary
|
||||
/// storage to remember the results of key evaluation. The order of calls to the key function is
|
||||
/// unspecified and may change in future versions of the standard library.
|
||||
///
|
||||
/// For simple key functions (e.g., functions that are property accesses or
|
||||
/// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be
|
||||
/// faster.
|
||||
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
|
||||
/// All original elements will remain in the slice and any possible modifications via interior
|
||||
/// mutability are observed in the input. Same is true if `K: Ord` panics.
|
||||
///
|
||||
/// For simple key functions (e.g., functions that are property accesses or basic operations),
|
||||
/// [`sort_by_key`](slice::sort_by_key) is likely to be faster.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
|
||||
/// which combines the fast average case of randomized quicksort with the fast worst case of
|
||||
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
|
||||
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
|
||||
/// deterministic behavior.
|
||||
/// The current implementation is based on [instruction-parallel-network sort][ipnsort] by Lukas
|
||||
/// Bergdoll, which combines the fast average case of randomized quicksort with the fast worst
|
||||
/// case of heapsort, while achieving linear time on fully sorted and reversed inputs. And
|
||||
/// *O*(*k* \* log(*n*)) where *k* is the number of distinct elements in the input. It leverages
|
||||
/// superscalar out-of-order execution capabilities commonly found in CPUs, to efficiently
|
||||
/// perform the operation.
|
||||
///
|
||||
/// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
|
||||
/// length of the slice.
|
||||
@ -344,7 +365,7 @@ impl<T> [T] {
|
||||
/// assert!(v == [-3, -5, 2, 32, 4]);
|
||||
/// ```
|
||||
///
|
||||
/// [pdqsort]: https://github.com/orlp/pdqsort
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
#[rustc_allow_incoherent_impl]
|
||||
#[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")]
|
||||
@ -361,7 +382,7 @@ impl<T> [T] {
|
||||
$slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
|
||||
// The elements of `indices` are unique, as they are indexed, so any sort will be
|
||||
// stable with respect to the original slice. We use `sort_unstable` here because
|
||||
// it requires less memory allocation.
|
||||
// it requires no memory allocation.
|
||||
indices.sort_unstable();
|
||||
for i in 0..$slice.len() {
|
||||
let mut index = indices[i].1;
|
||||
@ -374,24 +395,24 @@ impl<T> [T] {
|
||||
}};
|
||||
}
|
||||
|
||||
let sz_u8 = mem::size_of::<(K, u8)>();
|
||||
let sz_u16 = mem::size_of::<(K, u16)>();
|
||||
let sz_u32 = mem::size_of::<(K, u32)>();
|
||||
let sz_usize = mem::size_of::<(K, usize)>();
|
||||
|
||||
let len = self.len();
|
||||
if len < 2 {
|
||||
return;
|
||||
}
|
||||
if sz_u8 < sz_u16 && len <= (u8::MAX as usize) {
|
||||
return sort_by_key!(u8, self, f);
|
||||
}
|
||||
if sz_u16 < sz_u32 && len <= (u16::MAX as usize) {
|
||||
return sort_by_key!(u16, self, f);
|
||||
}
|
||||
if sz_u32 < sz_usize && len <= (u32::MAX as usize) {
|
||||
|
||||
// Avoids binary-size usage in cases where the alignment doesn't work out to make this
|
||||
// beneficial or on 32-bit platforms.
|
||||
let is_using_u32_as_idx_type_helpful =
|
||||
const { mem::size_of::<(K, u32)>() < mem::size_of::<(K, usize)>() };
|
||||
|
||||
// It's possible to instantiate this for u8 and u16 but, doing so is very wasteful in terms
|
||||
// of compile-times and binary-size, the peak saved heap memory for u16 is (u8 + u16) -> 4
|
||||
// bytes * u16::MAX vs (u8 + u32) -> 8 bytes * u16::MAX, the saved heap memory is at peak
|
||||
// ~262KB.
|
||||
if is_using_u32_as_idx_type_helpful && len <= (u32::MAX as usize) {
|
||||
return sort_by_key!(u32, self, f);
|
||||
}
|
||||
|
||||
sort_by_key!(usize, self, f)
|
||||
}
|
||||
|
||||
@ -843,46 +864,18 @@ fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
if T::IS_ZST {
|
||||
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
|
||||
return;
|
||||
use sort::stable::BufGuard;
|
||||
|
||||
#[unstable(issue = "none", feature = "std_internals")]
|
||||
impl<T> BufGuard<T> for Vec<T> {
|
||||
fn with_capacity(capacity: usize) -> Self {
|
||||
Vec::with_capacity(capacity)
|
||||
}
|
||||
|
||||
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
|
||||
self.spare_capacity_mut()
|
||||
}
|
||||
}
|
||||
|
||||
let elem_alloc_fn = |len: usize| -> *mut T {
|
||||
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
|
||||
// v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
|
||||
// elements.
|
||||
unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
|
||||
};
|
||||
|
||||
let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
|
||||
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
|
||||
// v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
|
||||
// len.
|
||||
unsafe {
|
||||
alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
|
||||
}
|
||||
};
|
||||
|
||||
let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
|
||||
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
|
||||
// obscene length or 0.
|
||||
unsafe {
|
||||
alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
|
||||
as *mut sort::TimSortRun
|
||||
}
|
||||
};
|
||||
|
||||
let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
|
||||
// SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
|
||||
// len.
|
||||
unsafe {
|
||||
alloc::dealloc(
|
||||
buf_ptr as *mut u8,
|
||||
alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
|
||||
sort::stable::sort::<T, F, Vec<T>>(v, &mut is_less);
|
||||
}
|
||||
|
@ -34,7 +34,7 @@ macro_rules! do_test {
|
||||
}
|
||||
|
||||
let v = $input.to_owned();
|
||||
let _ = std::panic::catch_unwind(move || {
|
||||
let _ = panic::catch_unwind(move || {
|
||||
let mut v = v;
|
||||
let mut panic_countdown = panic_countdown;
|
||||
v.$func(|a, b| {
|
||||
@ -197,8 +197,7 @@ fn panic_safe() {
|
||||
|
||||
let mut rng = test_rng();
|
||||
|
||||
// Miri is too slow (but still need to `chain` to make the types match)
|
||||
let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) };
|
||||
let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) };
|
||||
let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] };
|
||||
|
||||
for len in lens {
|
||||
@ -294,15 +293,20 @@ fn test_sort() {
|
||||
}
|
||||
}
|
||||
|
||||
// Sort using a completely random comparison function.
|
||||
// This will reorder the elements *somehow*, but won't panic.
|
||||
let mut v = [0; 500];
|
||||
for i in 0..v.len() {
|
||||
const ORD_VIOLATION_MAX_LEN: usize = 500;
|
||||
let mut v = [0; ORD_VIOLATION_MAX_LEN];
|
||||
for i in 0..ORD_VIOLATION_MAX_LEN {
|
||||
v[i] = i as i32;
|
||||
}
|
||||
v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
|
||||
|
||||
// Sort using a completely random comparison function. This will reorder the elements *somehow*,
|
||||
// it may panic but the original elements must still be present.
|
||||
let _ = panic::catch_unwind(move || {
|
||||
v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
|
||||
});
|
||||
|
||||
v.sort();
|
||||
for i in 0..v.len() {
|
||||
for i in 0..ORD_VIOLATION_MAX_LEN {
|
||||
assert_eq!(v[i], i as i32);
|
||||
}
|
||||
|
||||
|
@ -39,7 +39,6 @@ pub(crate) mod index;
|
||||
mod iter;
|
||||
mod raw;
|
||||
mod rotate;
|
||||
mod select;
|
||||
mod specialize;
|
||||
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
@ -83,10 +82,6 @@ pub use raw::{from_mut, from_ref};
|
||||
#[unstable(feature = "slice_from_ptr_range", issue = "89792")]
|
||||
pub use raw::{from_mut_ptr_range, from_ptr_range};
|
||||
|
||||
// This function is public only because there is no other way to unit test heapsort.
|
||||
#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
|
||||
pub use sort::heapsort;
|
||||
|
||||
#[stable(feature = "slice_get_slice", since = "1.28.0")]
|
||||
pub use index::SliceIndex;
|
||||
|
||||
@ -2884,21 +2879,26 @@ impl<T> [T] {
|
||||
self.binary_search_by(|k| f(k).cmp(b))
|
||||
}
|
||||
|
||||
/// Sorts the slice, but might not preserve the order of equal elements.
|
||||
/// Sorts the slice **without** preserving the initial order of equal elements.
|
||||
///
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place
|
||||
/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
|
||||
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
|
||||
/// original elements will remain in the slice and any possible modifications via interior
|
||||
/// mutability are observed in the input. Same is true if `T: Ord` panics.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
|
||||
/// which combines the fast average case of randomized quicksort with the fast worst case of
|
||||
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
|
||||
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
|
||||
/// deterministic behavior.
|
||||
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
|
||||
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
|
||||
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
|
||||
/// expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
|
||||
/// slice consists of several concatenated sorted sequences.
|
||||
/// slice is partially sorted.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -2909,25 +2909,29 @@ impl<T> [T] {
|
||||
/// assert!(v == [-5, -3, 1, 2, 4]);
|
||||
/// ```
|
||||
///
|
||||
/// [pdqsort]: https://github.com/orlp/pdqsort
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "sort_unstable", since = "1.20.0")]
|
||||
#[inline]
|
||||
pub fn sort_unstable(&mut self)
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
sort::quicksort(self, T::lt);
|
||||
sort::unstable::sort(self, &mut T::lt);
|
||||
}
|
||||
|
||||
/// Sorts the slice with a comparator function, but might not preserve the order of equal
|
||||
/// elements.
|
||||
/// Sorts the slice with a comparator function, **without** preserving the initial order of
|
||||
/// equal elements.
|
||||
///
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place
|
||||
/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
|
||||
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
|
||||
///
|
||||
/// The comparator function must define a total ordering for the elements in the slice. If
|
||||
/// the ordering is not total, the order of the elements is unspecified. An order is a
|
||||
/// total order if it is (for all `a`, `b` and `c`):
|
||||
/// The comparator function should define a total ordering for the elements in the slice. If the
|
||||
/// ordering is not total, the order of the elements is unspecified.
|
||||
///
|
||||
/// If the comparator function does not implement a total order the resulting order is
|
||||
/// unspecified. All original elements will remain in the slice and any possible modifications
|
||||
/// via interior mutability are observed in the input. Same is true if the comparator function
|
||||
/// panics. A total order (for all `a`, `b` and `c`):
|
||||
///
|
||||
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
|
||||
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
|
||||
@ -2943,14 +2947,15 @@ impl<T> [T] {
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
|
||||
/// which combines the fast average case of randomized quicksort with the fast worst case of
|
||||
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
|
||||
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
|
||||
/// deterministic behavior.
|
||||
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
|
||||
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
|
||||
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
|
||||
/// expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
|
||||
/// slice consists of several concatenated sorted sequences.
|
||||
/// slice is partially sorted.
|
||||
///
|
||||
/// If `T: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -2964,34 +2969,37 @@ impl<T> [T] {
|
||||
/// assert!(v == [5, 4, 3, 2, 1]);
|
||||
/// ```
|
||||
///
|
||||
/// [pdqsort]: https://github.com/orlp/pdqsort
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "sort_unstable", since = "1.20.0")]
|
||||
#[inline]
|
||||
pub fn sort_unstable_by<F>(&mut self, mut compare: F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> Ordering,
|
||||
{
|
||||
sort::quicksort(self, |a, b| compare(a, b) == Ordering::Less);
|
||||
sort::unstable::sort(self, &mut |a, b| compare(a, b) == Ordering::Less);
|
||||
}
|
||||
|
||||
/// Sorts the slice with a key extraction function, but might not preserve the order of equal
|
||||
/// elements.
|
||||
/// Sorts the slice with a key extraction function, **without** preserving the initial order of
|
||||
/// equal elements.
|
||||
///
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place
|
||||
/// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case, where the key function is
|
||||
/// *O*(*m*).
|
||||
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
|
||||
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
|
||||
///
|
||||
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
|
||||
/// All original elements will remain in the slice and any possible modifications via interior
|
||||
/// mutability are observed in the input. Same is true if `K: Ord` panics.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
|
||||
/// which combines the fast average case of randomized quicksort with the fast worst case of
|
||||
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
|
||||
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
|
||||
/// deterministic behavior.
|
||||
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
|
||||
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
|
||||
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
|
||||
/// expected time to sort the data is *O(*n* log(*k*))*.
|
||||
///
|
||||
/// Due to its key calling strategy, [`sort_unstable_by_key`](#method.sort_unstable_by_key)
|
||||
/// is likely to be slower than [`sort_by_cached_key`](#method.sort_by_cached_key) in
|
||||
/// cases where the key function is expensive.
|
||||
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
|
||||
/// slice is partially sorted.
|
||||
///
|
||||
/// If `K: Ord` does not implement a total order, the implementation may panic.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -3002,7 +3010,7 @@ impl<T> [T] {
|
||||
/// assert!(v == [1, 2, -3, 4, -5]);
|
||||
/// ```
|
||||
///
|
||||
/// [pdqsort]: https://github.com/orlp/pdqsort
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "sort_unstable", since = "1.20.0")]
|
||||
#[inline]
|
||||
pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
|
||||
@ -3010,27 +3018,32 @@ impl<T> [T] {
|
||||
F: FnMut(&T) -> K,
|
||||
K: Ord,
|
||||
{
|
||||
sort::quicksort(self, |a, b| f(a).lt(&f(b)));
|
||||
sort::unstable::sort(self, &mut |a, b| f(a).lt(&f(b)));
|
||||
}
|
||||
|
||||
/// Reorder the slice such that the element at `index` after the reordering is at its final sorted position.
|
||||
/// Reorder the slice such that the element at `index` after the reordering is at its final
|
||||
/// sorted position.
|
||||
///
|
||||
/// This reordering has the additional property that any value at position `i < index` will be
|
||||
/// less than or equal to any value at a position `j > index`. Additionally, this reordering is
|
||||
/// unstable (i.e. any number of equal elements may end up at position `index`), in-place
|
||||
/// (i.e. does not allocate), and runs in *O*(*n*) time.
|
||||
/// This function is also known as "kth element" in other libraries.
|
||||
/// unstable (i.e. any number of equal elements may end up at position `index`), in-place (i.e.
|
||||
/// does not allocate), and runs in *O*(*n*) time. This function is also known as "kth element"
|
||||
/// in other libraries.
|
||||
///
|
||||
/// It returns a triplet of the following from the reordered slice:
|
||||
/// the subslice prior to `index`, the element at `index`, and the subslice after `index`;
|
||||
/// accordingly, the values in those two subslices will respectively all be less-than-or-equal-to
|
||||
/// and greater-than-or-equal-to the value of the element at `index`.
|
||||
/// It returns a triplet of the following from the reordered slice: the subslice prior to
|
||||
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
|
||||
/// those two subslices will respectively all be less-than-or-equal-to and
|
||||
/// greater-than-or-equal-to the value of the element at `index`.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
|
||||
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
|
||||
/// pivot selection, which guarantees linear runtime for all inputs.
|
||||
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
|
||||
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
|
||||
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
|
||||
/// for all inputs.
|
||||
///
|
||||
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
|
||||
/// nearly fully sorted, where [`slice::sort`] may be faster.
|
||||
///
|
||||
/// [`sort_unstable`]: slice::sort_unstable
|
||||
///
|
||||
@ -3058,35 +3071,40 @@ impl<T> [T] {
|
||||
/// v == [-3, -5, 1, 4, 2] ||
|
||||
/// v == [-5, -3, 1, 4, 2]);
|
||||
/// ```
|
||||
///
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
|
||||
#[inline]
|
||||
pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [T])
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
select::partition_at_index(self, index, T::lt)
|
||||
sort::select::partition_at_index(self, index, T::lt)
|
||||
}
|
||||
|
||||
/// Reorder the slice with a comparator function such that the element at `index` after the reordering is at
|
||||
/// its final sorted position.
|
||||
/// Reorder the slice with a comparator function such that the element at `index` after the
|
||||
/// reordering is at its final sorted position.
|
||||
///
|
||||
/// This reordering has the additional property that any value at position `i < index` will be
|
||||
/// less than or equal to any value at a position `j > index` using the comparator function.
|
||||
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
|
||||
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
|
||||
/// This function is also known as "kth element" in other libraries.
|
||||
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
|
||||
/// function is also known as "kth element" in other libraries.
|
||||
///
|
||||
/// It returns a triplet of the following from
|
||||
/// the slice reordered according to the provided comparator function: the subslice prior to
|
||||
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
|
||||
/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
|
||||
/// the value of the element at `index`.
|
||||
/// It returns a triplet of the following from the slice reordered according to the provided
|
||||
/// comparator function: the subslice prior to `index`, the element at `index`, and the subslice
|
||||
/// after `index`; accordingly, the values in those two subslices will respectively all be
|
||||
/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
|
||||
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
|
||||
/// pivot selection, which guarantees linear runtime for all inputs.
|
||||
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
|
||||
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
|
||||
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
|
||||
/// for all inputs.
|
||||
///
|
||||
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
|
||||
/// nearly fully sorted, where [`slice::sort`] may be faster.
|
||||
///
|
||||
/// [`sort_unstable`]: slice::sort_unstable
|
||||
///
|
||||
@ -3114,6 +3132,8 @@ impl<T> [T] {
|
||||
/// v == [4, 2, 1, -5, -3] ||
|
||||
/// v == [4, 2, 1, -3, -5]);
|
||||
/// ```
|
||||
///
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
|
||||
#[inline]
|
||||
pub fn select_nth_unstable_by<F>(
|
||||
@ -3124,29 +3144,32 @@ impl<T> [T] {
|
||||
where
|
||||
F: FnMut(&T, &T) -> Ordering,
|
||||
{
|
||||
select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
|
||||
sort::select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
|
||||
}
|
||||
|
||||
/// Reorder the slice with a key extraction function such that the element at `index` after the reordering is
|
||||
/// at its final sorted position.
|
||||
/// Reorder the slice with a key extraction function such that the element at `index` after the
|
||||
/// reordering is at its final sorted position.
|
||||
///
|
||||
/// This reordering has the additional property that any value at position `i < index` will be
|
||||
/// less than or equal to any value at a position `j > index` using the key extraction function.
|
||||
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
|
||||
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
|
||||
/// This function is also known as "kth element" in other libraries.
|
||||
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
|
||||
/// function is also known as "kth element" in other libraries.
|
||||
///
|
||||
/// It returns a triplet of the following from
|
||||
/// the slice reordered according to the provided key extraction function: the subslice prior to
|
||||
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
|
||||
/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
|
||||
/// the value of the element at `index`.
|
||||
/// It returns a triplet of the following from the slice reordered according to the provided key
|
||||
/// extraction function: the subslice prior to `index`, the element at `index`, and the subslice
|
||||
/// after `index`; accordingly, the values in those two subslices will respectively all be
|
||||
/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
|
||||
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
|
||||
/// pivot selection, which guarantees linear runtime for all inputs.
|
||||
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
|
||||
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
|
||||
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
|
||||
/// for all inputs.
|
||||
///
|
||||
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
|
||||
/// nearly fully sorted, where [`slice::sort`] may be faster.
|
||||
///
|
||||
/// [`sort_unstable`]: slice::sort_unstable
|
||||
///
|
||||
@ -3174,6 +3197,8 @@ impl<T> [T] {
|
||||
/// v == [2, 1, -3, 4, -5] ||
|
||||
/// v == [2, 1, -3, -5, 4]);
|
||||
/// ```
|
||||
///
|
||||
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
|
||||
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
|
||||
#[inline]
|
||||
pub fn select_nth_unstable_by_key<K, F>(
|
||||
@ -3185,7 +3210,7 @@ impl<T> [T] {
|
||||
F: FnMut(&T) -> K,
|
||||
K: Ord,
|
||||
{
|
||||
select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
|
||||
sort::select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
|
||||
}
|
||||
|
||||
/// Moves all consecutive repeated elements to the end of the slice according to the
|
||||
|
File diff suppressed because it is too large
Load Diff
8
library/core/src/slice/sort/mod.rs
Normal file
8
library/core/src/slice/sort/mod.rs
Normal file
@ -0,0 +1,8 @@
|
||||
//! This module and the contained sub-modules contains the code for efficient and robust sort
|
||||
//! implementations, as well as the domain adjacent implementation of `select_nth_unstable`.
|
||||
|
||||
pub mod stable;
|
||||
pub mod unstable;
|
||||
|
||||
pub(crate) mod select;
|
||||
pub(crate) mod shared;
|
@ -1,45 +1,78 @@
|
||||
//! Slice selection
|
||||
//!
|
||||
//! This module contains the implementation for `slice::select_nth_unstable`.
|
||||
//! It uses an introselect algorithm based on Orson Peters' pattern-defeating quicksort,
|
||||
//! published at: <https://github.com/orlp/pdqsort>
|
||||
//! It uses an introselect algorithm based on ipnsort by Lukas Bergdoll and Orson Peters,
|
||||
//! published at: <https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort>
|
||||
//!
|
||||
//! The fallback algorithm used for introselect is Median of Medians using Tukey's Ninther
|
||||
//! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
|
||||
//! better performance than one would get using heapsort as fallback.
|
||||
|
||||
use crate::cmp;
|
||||
use crate::mem::{self, SizedTypeProperties};
|
||||
use crate::slice::sort::{
|
||||
break_patterns, choose_pivot, insertion_sort_shift_left, partition, partition_equal,
|
||||
};
|
||||
|
||||
// For slices of up to this length it's probably faster to simply sort them.
|
||||
// Defined at the module scope because it's used in multiple functions.
|
||||
const MAX_INSERTION: usize = 10;
|
||||
use crate::slice::sort::shared::pivot::choose_pivot;
|
||||
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
|
||||
use crate::slice::sort::unstable::quicksort::partition;
|
||||
|
||||
/// Reorder the slice such that the element at `index` is at its final sorted position.
|
||||
pub(crate) fn partition_at_index<T, F>(
|
||||
v: &mut [T],
|
||||
index: usize,
|
||||
mut is_less: F,
|
||||
) -> (&mut [T], &mut T, &mut [T])
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
let len = v.len();
|
||||
|
||||
// Puts a lower limit of 1 on `len`.
|
||||
if index >= len {
|
||||
panic!("partition_at_index index {} greater than length of slice {}", index, len);
|
||||
}
|
||||
|
||||
if T::IS_ZST {
|
||||
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
|
||||
} else if index == len - 1 {
|
||||
// Find max element and place it in the last position of the array. We're free to use
|
||||
// `unwrap()` here because we checked that `v` is not empty.
|
||||
let max_idx = max_index(v, &mut is_less).unwrap();
|
||||
v.swap(max_idx, index);
|
||||
} else if index == 0 {
|
||||
// Find min element and place it in the first position of the array. We're free to use
|
||||
// `unwrap()` here because we checked that `v` is not empty.
|
||||
let min_idx = min_index(v, &mut is_less).unwrap();
|
||||
v.swap(min_idx, index);
|
||||
} else {
|
||||
partition_at_index_loop(v, index, None, &mut is_less);
|
||||
}
|
||||
|
||||
let (left, right) = v.split_at_mut(index);
|
||||
let (pivot, right) = right.split_at_mut(1);
|
||||
let pivot = &mut pivot[0];
|
||||
(left, pivot, right)
|
||||
}
|
||||
|
||||
// For small sub-slices it's faster to use a dedicated small-sort, but because it is only called at
|
||||
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
|
||||
const INSERTION_SORT_THRESHOLD: usize = 16;
|
||||
|
||||
fn partition_at_index_loop<'a, T, F>(
|
||||
mut v: &'a mut [T],
|
||||
mut index: usize,
|
||||
mut ancestor_pivot: Option<&'a T>,
|
||||
is_less: &mut F,
|
||||
mut pred: Option<&'a T>,
|
||||
) where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
// Limit the amount of iterations and fall back to fast deterministic selection
|
||||
// to ensure O(n) worst case running time. This limit needs to be constant, because
|
||||
// using `ilog2(len)` like in `sort` would result in O(n log n) time complexity.
|
||||
// The exact value of the limit is chosen somewhat arbitrarily, but for most inputs bad pivot
|
||||
// selections should be relatively rare, so the limit usually shouldn't be reached
|
||||
// anyways.
|
||||
// Limit the amount of iterations and fall back to fast deterministic selection to ensure O(n)
|
||||
// worst case running time. This limit needs to be constant, because using `ilog2(len)` like in
|
||||
// `sort` would result in O(n log n) time complexity. The exact value of the limit is chosen
|
||||
// somewhat arbitrarily, but for most inputs bad pivot selections should be relatively rare, so
|
||||
// the limit is reached for sub-slices len / (2^limit or less). Which makes the remaining work
|
||||
// with the fallback minimal in relative terms.
|
||||
let mut limit = 16;
|
||||
|
||||
// True if the last partitioning was reasonably balanced.
|
||||
let mut was_balanced = true;
|
||||
|
||||
loop {
|
||||
if v.len() <= MAX_INSERTION {
|
||||
if v.len() > 1 {
|
||||
if v.len() <= INSERTION_SORT_THRESHOLD {
|
||||
if v.len() >= 2 {
|
||||
insertion_sort_shift_left(v, 1, is_less);
|
||||
}
|
||||
return;
|
||||
@ -50,38 +83,35 @@ fn partition_at_index_loop<'a, T, F>(
|
||||
return;
|
||||
}
|
||||
|
||||
// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
|
||||
// some elements around. Hopefully we'll choose a better pivot this time.
|
||||
if !was_balanced {
|
||||
break_patterns(v);
|
||||
limit -= 1;
|
||||
}
|
||||
limit -= 1;
|
||||
|
||||
// Choose a pivot
|
||||
let (pivot, _) = choose_pivot(v, is_less);
|
||||
let pivot_pos = choose_pivot(v, is_less);
|
||||
|
||||
// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
|
||||
// slice. Partition the slice into elements equal to and elements greater than the pivot.
|
||||
// This case is usually hit when the slice contains many duplicate elements.
|
||||
if let Some(p) = pred {
|
||||
if !is_less(p, &v[pivot]) {
|
||||
let mid = partition_equal(v, pivot, is_less);
|
||||
if let Some(p) = ancestor_pivot {
|
||||
if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
|
||||
let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));
|
||||
|
||||
// Continue sorting elements greater than the pivot. We know that `mid` contains
|
||||
// the pivot. So we can continue after `mid`.
|
||||
let mid = num_lt + 1;
|
||||
|
||||
// If we've passed our index, then we're good.
|
||||
if mid > index {
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise, continue sorting elements greater than the pivot.
|
||||
v = &mut v[mid..];
|
||||
index = index - mid;
|
||||
pred = None;
|
||||
ancestor_pivot = None;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let (mid, _) = partition(v, pivot, is_less);
|
||||
was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8;
|
||||
let mid = partition(v, pivot_pos, is_less);
|
||||
|
||||
// Split the slice into `left`, `pivot`, and `right`.
|
||||
let (left, right) = v.split_at_mut(mid);
|
||||
@ -91,7 +121,7 @@ fn partition_at_index_loop<'a, T, F>(
|
||||
if mid < index {
|
||||
v = right;
|
||||
index = index - mid - 1;
|
||||
pred = Some(pivot);
|
||||
ancestor_pivot = Some(pivot);
|
||||
} else if mid > index {
|
||||
v = left;
|
||||
} else {
|
||||
@ -122,41 +152,6 @@ fn max_index<T, F: FnMut(&T, &T) -> bool>(slice: &[T], is_less: &mut F) -> Optio
|
||||
.map(|(i, _)| i)
|
||||
}
|
||||
|
||||
/// Reorder the slice such that the element at `index` is at its final sorted position.
|
||||
pub fn partition_at_index<T, F>(
|
||||
v: &mut [T],
|
||||
index: usize,
|
||||
mut is_less: F,
|
||||
) -> (&mut [T], &mut T, &mut [T])
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
if index >= v.len() {
|
||||
panic!("partition_at_index index {} greater than length of slice {}", index, v.len());
|
||||
}
|
||||
|
||||
if T::IS_ZST {
|
||||
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
|
||||
} else if index == v.len() - 1 {
|
||||
// Find max element and place it in the last position of the array. We're free to use
|
||||
// `unwrap()` here because we know v must not be empty.
|
||||
let max_idx = max_index(v, &mut is_less).unwrap();
|
||||
v.swap(max_idx, index);
|
||||
} else if index == 0 {
|
||||
// Find min element and place it in the first position of the array. We're free to use
|
||||
// `unwrap()` here because we know v must not be empty.
|
||||
let min_idx = min_index(v, &mut is_less).unwrap();
|
||||
v.swap(min_idx, index);
|
||||
} else {
|
||||
partition_at_index_loop(v, index, &mut is_less, None);
|
||||
}
|
||||
|
||||
let (left, right) = v.split_at_mut(index);
|
||||
let (pivot, right) = right.split_at_mut(1);
|
||||
let pivot = &mut pivot[0];
|
||||
(left, pivot, right)
|
||||
}
|
||||
|
||||
/// Selection algorithm to select the k-th element from the slice in guaranteed O(n) time.
|
||||
/// This is essentially a quickselect that uses Tukey's Ninther for pivot selection
|
||||
fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) {
|
||||
@ -168,8 +163,8 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
|
||||
|
||||
// We now know that `k < v.len() <= isize::MAX`
|
||||
loop {
|
||||
if v.len() <= MAX_INSERTION {
|
||||
if v.len() > 1 {
|
||||
if v.len() <= INSERTION_SORT_THRESHOLD {
|
||||
if v.len() >= 2 {
|
||||
insertion_sort_shift_left(v, 1, is_less);
|
||||
}
|
||||
return;
|
||||
@ -232,7 +227,8 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
|
||||
}
|
||||
|
||||
median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
|
||||
partition(v, lo + pivot, is_less).0
|
||||
|
||||
partition(v, lo + pivot, is_less)
|
||||
}
|
||||
|
||||
/// Moves around the 9 elements at the indices a..i, such that
|
45
library/core/src/slice/sort/shared/mod.rs
Normal file
45
library/core/src/slice/sort/shared/mod.rs
Normal file
@ -0,0 +1,45 @@
|
||||
use crate::marker::Freeze;
|
||||
|
||||
pub(crate) mod pivot;
|
||||
pub(crate) mod smallsort;
|
||||
|
||||
/// SAFETY: this is safety relevant, how does this interact with the soundness holes in
|
||||
/// specialization?
|
||||
#[rustc_unsafe_specialization_marker]
|
||||
pub(crate) trait FreezeMarker {}
|
||||
|
||||
impl<T: Freeze> FreezeMarker for T {}
|
||||
|
||||
/// Finds a run of sorted elements starting at the beginning of the slice.
|
||||
///
|
||||
/// Returns the length of the run, and a bool that is false when the run
|
||||
/// is ascending, and true if the run strictly descending.
|
||||
#[inline(always)]
|
||||
pub(crate) fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &[T],
|
||||
is_less: &mut F,
|
||||
) -> (usize, bool) {
|
||||
let len = v.len();
|
||||
if len < 2 {
|
||||
return (len, false);
|
||||
}
|
||||
|
||||
// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
|
||||
// This also means that run_len < len implies run_len and run_len - 1
|
||||
// are valid indices as well.
|
||||
unsafe {
|
||||
let mut run_len = 2;
|
||||
let strictly_descending = is_less(v.get_unchecked(1), v.get_unchecked(0));
|
||||
if strictly_descending {
|
||||
while run_len < len && is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) {
|
||||
run_len += 1;
|
||||
}
|
||||
} else {
|
||||
while run_len < len && !is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1))
|
||||
{
|
||||
run_len += 1;
|
||||
}
|
||||
}
|
||||
(run_len, strictly_descending)
|
||||
}
|
||||
}
|
88
library/core/src/slice/sort/shared/pivot.rs
Normal file
88
library/core/src/slice/sort/shared/pivot.rs
Normal file
@ -0,0 +1,88 @@
|
||||
//! This module contains the logic for pivot selection.
|
||||
|
||||
use crate::intrinsics;
|
||||
|
||||
// Recursively select a pseudomedian if above this threshold.
|
||||
const PSEUDO_MEDIAN_REC_THRESHOLD: usize = 64;
|
||||
|
||||
/// Selects a pivot from `v`. Algorithm taken from glidesort by Orson Peters.
|
||||
///
|
||||
/// This chooses a pivot by sampling an adaptive amount of points, approximating
|
||||
/// the quality of a median of sqrt(n) elements.
|
||||
pub fn choose_pivot<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> usize {
|
||||
// We use unsafe code and raw pointers here because we're dealing with
|
||||
// heavy recursion. Passing safe slices around would involve a lot of
|
||||
// branches and function call overhead.
|
||||
|
||||
let len = v.len();
|
||||
if len < 8 {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
// SAFETY: a, b, c point to initialized regions of len_div_8 elements,
|
||||
// satisfying median3 and median3_rec's preconditions as v_base points
|
||||
// to an initialized region of n = len elements.
|
||||
unsafe {
|
||||
let v_base = v.as_ptr();
|
||||
let len_div_8 = len / 8;
|
||||
|
||||
let a = v_base; // [0, floor(n/8))
|
||||
let b = v_base.add(len_div_8 * 4); // [4*floor(n/8), 5*floor(n/8))
|
||||
let c = v_base.add(len_div_8 * 7); // [7*floor(n/8), 8*floor(n/8))
|
||||
|
||||
if len < PSEUDO_MEDIAN_REC_THRESHOLD {
|
||||
median3(&*a, &*b, &*c, is_less).sub_ptr(v_base)
|
||||
} else {
|
||||
median3_rec(a, b, c, len_div_8, is_less).sub_ptr(v_base)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculates an approximate median of 3 elements from sections a, b, c, or
|
||||
/// recursively from an approximation of each, if they're large enough. By
|
||||
/// dividing the size of each section by 8 when recursing we have logarithmic
|
||||
/// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
|
||||
/// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
|
||||
///
|
||||
/// SAFETY: a, b, c must point to the start of initialized regions of memory of
|
||||
/// at least n elements.
|
||||
unsafe fn median3_rec<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut a: *const T,
|
||||
mut b: *const T,
|
||||
mut c: *const T,
|
||||
n: usize,
|
||||
is_less: &mut F,
|
||||
) -> *const T {
|
||||
// SAFETY: a, b, c still point to initialized regions of n / 8 elements,
|
||||
// by the exact same logic as in choose_pivot.
|
||||
unsafe {
|
||||
if n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD {
|
||||
let n8 = n / 8;
|
||||
a = median3_rec(a, a.add(n8 * 4), a.add(n8 * 7), n8, is_less);
|
||||
b = median3_rec(b, b.add(n8 * 4), b.add(n8 * 7), n8, is_less);
|
||||
c = median3_rec(c, c.add(n8 * 4), c.add(n8 * 7), n8, is_less);
|
||||
}
|
||||
median3(&*a, &*b, &*c, is_less)
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculates the median of 3 elements.
|
||||
///
|
||||
/// SAFETY: a, b, c must be valid initialized elements.
|
||||
#[inline(always)]
|
||||
fn median3<T, F: FnMut(&T, &T) -> bool>(a: &T, b: &T, c: &T, is_less: &mut F) -> *const T {
|
||||
// Compiler tends to make this branchless when sensible, and avoids the
|
||||
// third comparison when not.
|
||||
let x = is_less(a, b);
|
||||
let y = is_less(a, c);
|
||||
if x == y {
|
||||
// If x=y=0 then b, c <= a. In this case we want to return max(b, c).
|
||||
// If x=y=1 then a < b, c. In this case we want to return min(b, c).
|
||||
// By toggling the outcome of b < c using XOR x we get this behavior.
|
||||
let z = is_less(b, c);
|
||||
if z ^ x { c } else { b }
|
||||
} else {
|
||||
// Either c <= a < b or b <= a < c, thus a is our median.
|
||||
a
|
||||
}
|
||||
}
|
843
library/core/src/slice/sort/shared/smallsort.rs
Normal file
843
library/core/src/slice/sort/shared/smallsort.rs
Normal file
@ -0,0 +1,843 @@
|
||||
//! This module contains a variety of sort implementations that are optimized for small lengths.
|
||||
|
||||
use crate::intrinsics;
|
||||
use crate::mem::{self, ManuallyDrop, MaybeUninit};
|
||||
use crate::ptr;
|
||||
use crate::slice;
|
||||
|
||||
use crate::slice::sort::shared::FreezeMarker;
|
||||
|
||||
// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
|
||||
// small slices and small-sort performance sorting small sub-slices as part of
|
||||
// the main quicksort loop. For the former, testing showed that the
|
||||
// representative benchmarks for real-world performance are cold CPU state and
|
||||
// not single-size hot benchmarks. For the latter the CPU will call them many
|
||||
// times, so hot benchmarks are fine and more realistic. And it's worth it to
|
||||
// optimize sorting small sub-slices with more sophisticated solutions than
|
||||
// insertion sort.
|
||||
|
||||
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
|
||||
/// abstractions.
|
||||
pub(crate) trait StableSmallSortTypeImpl: Sized {
|
||||
/// For inputs of length at most the value returned by this function it is valid to call `small_sort`.
|
||||
fn small_sort_threshold() -> usize;
|
||||
|
||||
/// Sorts `v` using strategies optimized for small sizes.
|
||||
fn small_sort<F: FnMut(&Self, &Self) -> bool>(
|
||||
v: &mut [Self],
|
||||
scratch: &mut [MaybeUninit<Self>],
|
||||
is_less: &mut F,
|
||||
);
|
||||
}
|
||||
|
||||
impl<T> StableSmallSortTypeImpl for T {
|
||||
#[inline(always)]
|
||||
default fn small_sort_threshold() -> usize {
|
||||
// Optimal number of comparisons, and good perf.
|
||||
SMALL_SORT_FALLBACK_THRESHOLD
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
default fn small_sort<F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
_scratch: &mut [MaybeUninit<T>],
|
||||
is_less: &mut F,
|
||||
) {
|
||||
if v.len() >= 2 {
|
||||
insertion_sort_shift_left(v, 1, is_less);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
|
||||
#[inline(always)]
|
||||
fn small_sort_threshold() -> usize {
|
||||
SMALL_SORT_GENERAL_THRESHOLD
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn small_sort<F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
is_less: &mut F,
|
||||
) {
|
||||
small_sort_general_with_scratch(v, scratch, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
|
||||
/// abstractions.
|
||||
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
|
||||
/// For inputs of length at most the value returned by this function it is valid to call `small_sort`.
|
||||
fn small_sort_threshold() -> usize;
|
||||
|
||||
/// Sorts `v` using strategies optimized for small sizes.
|
||||
fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
|
||||
}
|
||||
|
||||
impl<T> UnstableSmallSortTypeImpl for T {
|
||||
#[inline(always)]
|
||||
default fn small_sort_threshold() -> usize {
|
||||
SMALL_SORT_FALLBACK_THRESHOLD
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
default fn small_sort<F>(v: &mut [T], is_less: &mut F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
small_sort_fallback(v, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
|
||||
#[inline(always)]
|
||||
fn small_sort_threshold() -> usize {
match const { choose_unstable_small_sort::<T>() } {
UnstableSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD,
UnstableSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD,
UnstableSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD,
}
}
|
||||
|
||||
#[inline(always)]
|
||||
fn small_sort<F>(v: &mut [T], is_less: &mut F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
// This construct is used to limit the LLVM IR generated, which saves large amounts of
|
||||
// compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
|
||||
(const { inst_unstable_small_sort::<T, F>() })(v, is_less);
|
||||
}
|
||||
}
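// The inline-const dispatch used above can be sketched in isolation (a minimal
// sketch; `pick_impl`, `small_impl` and `big_impl` are hypothetical names):
//
// const fn pick_impl<T>() -> fn(&mut [T]) {
//     if mem::size_of::<T>() <= 8 { small_impl::<T> } else { big_impl::<T> }
// }
//
// fn dispatch<T>(v: &mut [T]) {
//     // The function pointer is chosen at compile time, which per the comment
//     // above keeps the generated LLVM IR limited to the chosen implementation.
//     (const { pick_impl::<T>() })(v);
// }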
|
||||
|
||||
/// Optimal number of comparisons, and good perf.
|
||||
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;
|
||||
|
||||
/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
|
||||
/// wall-clock performance choosing 32 yielded better performance overall.
|
||||
///
|
||||
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
|
||||
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;
|
||||
|
||||
/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
|
||||
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
|
||||
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
|
||||
/// scratch buffer size.
|
||||
///
|
||||
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
|
||||
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;
|
||||
|
||||
/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
|
||||
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
|
||||
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;
|
||||
|
||||
/// Using a stack array could cause a stack overflow if the type `T` is very large. To be
|
||||
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
|
||||
/// within this limit.
|
||||
const MAX_STACK_ARRAY_SIZE: usize = 4096;
|
||||
|
||||
enum UnstableSmallSort {
Fallback,
General,
Network,
}

const fn choose_unstable_small_sort<T: FreezeMarker>() -> UnstableSmallSort {
if T::is_copy()
&& has_efficient_in_place_swap::<T>()
&& (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
{
// Heuristic for int-like types.
return UnstableSmallSort::Network;
}

if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
return UnstableSmallSort::General;
}

UnstableSmallSort::Fallback
}
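// For illustration, with the constants above (SMALL_SORT_NETWORK_SCRATCH_LEN =
// 32, SMALL_SORT_GENERAL_SCRATCH_LEN = 48, MAX_STACK_ARRAY_SIZE = 4096) this
// resolves roughly as follows on a typical 64-bit target:
// - i32: Copy, 4 bytes <= 8 and 4 * 32 <= 4096 -> Network
// - u128: Copy, but 16 bytes > 8; 16 * 48 <= 4096 -> General
// - String: not Copy; 24 * 48 <= 4096 -> General
// - [u8; 256]: Copy, but 256 bytes > 8; 256 * 48 > 4096 -> Fallback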
|
||||
|
||||
const fn inst_unstable_small_sort<T: FreezeMarker, F: FnMut(&T, &T) -> bool>()
-> fn(&mut [T], &mut F) {
match const { choose_unstable_small_sort::<T>() } {
UnstableSmallSort::Fallback => small_sort_fallback::<T, F>,
UnstableSmallSort::General => small_sort_general::<T, F>,
UnstableSmallSort::Network => small_sort_network::<T, F>,
}
}
|
||||
|
||||
fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
|
||||
if v.len() >= 2 {
|
||||
insertion_sort_shift_left(v, 1, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
|
||||
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();
|
||||
|
||||
let scratch = unsafe {
|
||||
slice::from_raw_parts_mut(
|
||||
stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
|
||||
SMALL_SORT_GENERAL_SCRATCH_LEN,
|
||||
)
|
||||
};
|
||||
|
||||
small_sort_general_with_scratch(v, scratch, is_less);
|
||||
}
|
||||
|
||||
fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
is_less: &mut F,
|
||||
) {
|
||||
let len = v.len();
|
||||
if len < 2 {
|
||||
return;
|
||||
}
|
||||
|
||||
if scratch.len() < len + 16 {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
let v_base = v.as_mut_ptr();
|
||||
let len_div_2 = len / 2;
|
||||
|
||||
// SAFETY: See individual comments.
|
||||
unsafe {
|
||||
let scratch_base = scratch.as_mut_ptr() as *mut T;
|
||||
|
||||
let presorted_len = if const { mem::size_of::<T>() <= 16 } && len >= 16 {
|
||||
// SAFETY: scratch_base is valid and has enough space.
|
||||
sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
|
||||
sort8_stable(
|
||||
v_base.add(len_div_2),
|
||||
scratch_base.add(len_div_2),
|
||||
scratch_base.add(len + 8),
|
||||
is_less,
|
||||
);
|
||||
|
||||
8
|
||||
} else if len >= 8 {
|
||||
// SAFETY: scratch_base is valid and has enough space.
|
||||
sort4_stable(v_base, scratch_base, is_less);
|
||||
sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);
|
||||
|
||||
4
|
||||
} else {
|
||||
ptr::copy_nonoverlapping(v_base, scratch_base, 1);
|
||||
ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);
|
||||
|
||||
1
|
||||
};
|
||||
|
||||
for offset in [0, len_div_2] {
|
||||
// SAFETY: at this point dst is initialized with presorted_len elements.
|
||||
// We extend this to desired_len, src is valid for desired_len elements.
|
||||
let src = v_base.add(offset);
|
||||
let dst = scratch_base.add(offset);
|
||||
let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };
|
||||
|
||||
for i in presorted_len..desired_len {
|
||||
ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
|
||||
insert_tail(dst, dst.add(i), is_less);
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: see comment in `CopyOnDrop::drop`.
|
||||
let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };
|
||||
|
||||
// SAFETY: at this point scratch_base is fully initialized, allowing us
|
||||
// to use it as the source of our merge back into the original array.
|
||||
// If a panic occurs we ensure the original array is restored to a valid
|
||||
// permutation of the input through drop_guard. This technique is similar
|
||||
// to ping-pong merging.
|
||||
bidirectional_merge(
|
||||
&*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
|
||||
drop_guard.dst,
|
||||
is_less,
|
||||
);
|
||||
mem::forget(drop_guard);
|
||||
}
|
||||
}
|
||||
|
||||
struct CopyOnDrop<T> {
|
||||
src: *const T,
|
||||
dst: *mut T,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl<T> Drop for CopyOnDrop<T> {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: `src` must contain `len` initialized elements, and dst must
|
||||
// be valid to write `len` elements.
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(self.src, self.dst, self.len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
|
||||
where
|
||||
T: FreezeMarker,
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
// This implementation is tuned to be efficient for integer types.
|
||||
|
||||
let len = v.len();
|
||||
if len < 2 {
|
||||
return;
|
||||
}
|
||||
|
||||
if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();
|
||||
|
||||
let len_div_2 = len / 2;
|
||||
let no_merge = len < 18;
|
||||
|
||||
let v_base = v.as_mut_ptr();
|
||||
let initial_region_len = if no_merge { len } else { len_div_2 };
|
||||
// SAFETY: Both possible values of `initial_region_len` are in-bounds.
|
||||
let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };
|
||||
|
||||
// Avoid compiler unrolling; we *really* don't want that to happen here for binary-size reasons.
|
||||
loop {
|
||||
let presorted_len = if region.len() >= 13 {
|
||||
sort13_optimal(region, is_less);
|
||||
13
|
||||
} else if region.len() >= 9 {
|
||||
sort9_optimal(region, is_less);
|
||||
9
|
||||
} else {
|
||||
1
|
||||
};
|
||||
|
||||
insertion_sort_shift_left(region, presorted_len, is_less);
|
||||
|
||||
if no_merge {
|
||||
return;
|
||||
}
|
||||
|
||||
if region.as_ptr() != v_base {
|
||||
break;
|
||||
}
|
||||
|
||||
// SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
|
||||
region =
|
||||
unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
|
||||
}
|
||||
|
||||
// SAFETY: We checked that T is Freeze and thus observation safe.
// Should is_less panic, v was not modified by bidirectional_merge and retains its original input.
// scratch and v must not alias and scratch has v.len() space.
|
||||
unsafe {
|
||||
let scratch_base = stack_array.as_mut_ptr() as *mut T;
|
||||
bidirectional_merge(
|
||||
&mut *ptr::slice_from_raw_parts_mut(v_base, len),
|
||||
scratch_base,
|
||||
is_less,
|
||||
);
|
||||
ptr::copy_nonoverlapping(scratch_base, v_base, len);
|
||||
}
|
||||
}
|
||||
|
||||
/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
|
||||
/// value at position `b_pos` is less than the one at position `a_pos`.
|
||||
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
// SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
|
||||
// pointers into `v_base`, and are properly aligned, and part of the same allocation.
|
||||
unsafe {
|
||||
let v_a = v_base.add(a_pos);
|
||||
let v_b = v_base.add(b_pos);
|
||||
|
||||
// PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
|
||||
// in a well defined state, without duplicates.
|
||||
|
||||
// Important to only swap if the element at b_pos is strictly less, and not if the two are
// equal; is_less returns false for equal elements, so we don't swap.
|
||||
let should_swap = is_less(&*v_b, &*v_a);
|
||||
|
||||
// This is a branchless version of a conditional swap.
// The equivalent code with a branch would be:
//
// if should_swap {
//     ptr::swap(v_a, v_b);
// }
|
||||
|
||||
// The goal is to generate cmov instructions here.
|
||||
let left_swap = if should_swap { v_b } else { v_a };
|
||||
let right_swap = if should_swap { v_a } else { v_b };
|
||||
|
||||
let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
|
||||
ptr::copy(left_swap, v_a, 1);
|
||||
ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
|
||||
// performance impact.
|
||||
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
if v.len() < 9 {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
let v_base = v.as_mut_ptr();
|
||||
|
||||
// Optimal sorting network see:
|
||||
// https://bertdobbelaere.github.io/sorting_networks.html.
|
||||
|
||||
// SAFETY: We checked the len.
|
||||
unsafe {
|
||||
swap_if_less(v_base, 0, 3, is_less);
|
||||
swap_if_less(v_base, 1, 7, is_less);
|
||||
swap_if_less(v_base, 2, 5, is_less);
|
||||
swap_if_less(v_base, 4, 8, is_less);
|
||||
swap_if_less(v_base, 0, 7, is_less);
|
||||
swap_if_less(v_base, 2, 4, is_less);
|
||||
swap_if_less(v_base, 3, 8, is_less);
|
||||
swap_if_less(v_base, 5, 6, is_less);
|
||||
swap_if_less(v_base, 0, 2, is_less);
|
||||
swap_if_less(v_base, 1, 3, is_less);
|
||||
swap_if_less(v_base, 4, 5, is_less);
|
||||
swap_if_less(v_base, 7, 8, is_less);
|
||||
swap_if_less(v_base, 1, 4, is_less);
|
||||
swap_if_less(v_base, 3, 6, is_less);
|
||||
swap_if_less(v_base, 5, 7, is_less);
|
||||
swap_if_less(v_base, 0, 1, is_less);
|
||||
swap_if_less(v_base, 2, 4, is_less);
|
||||
swap_if_less(v_base, 3, 5, is_less);
|
||||
swap_if_less(v_base, 6, 8, is_less);
|
||||
swap_if_less(v_base, 2, 3, is_less);
|
||||
swap_if_less(v_base, 4, 5, is_less);
|
||||
swap_if_less(v_base, 6, 7, is_less);
|
||||
swap_if_less(v_base, 1, 2, is_less);
|
||||
swap_if_less(v_base, 3, 4, is_less);
|
||||
swap_if_less(v_base, 5, 6, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
|
||||
// performance impact.
|
||||
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
|
||||
where
|
||||
F: FnMut(&T, &T) -> bool,
|
||||
{
|
||||
if v.len() < 13 {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
let v_base = v.as_mut_ptr();
|
||||
|
||||
// Optimal sorting network see:
|
||||
// https://bertdobbelaere.github.io/sorting_networks.html.
|
||||
|
||||
// SAFETY: We checked the len.
|
||||
unsafe {
|
||||
swap_if_less(v_base, 0, 12, is_less);
|
||||
swap_if_less(v_base, 1, 10, is_less);
|
||||
swap_if_less(v_base, 2, 9, is_less);
|
||||
swap_if_less(v_base, 3, 7, is_less);
|
||||
swap_if_less(v_base, 5, 11, is_less);
|
||||
swap_if_less(v_base, 6, 8, is_less);
|
||||
swap_if_less(v_base, 1, 6, is_less);
|
||||
swap_if_less(v_base, 2, 3, is_less);
|
||||
swap_if_less(v_base, 4, 11, is_less);
|
||||
swap_if_less(v_base, 7, 9, is_less);
|
||||
swap_if_less(v_base, 8, 10, is_less);
|
||||
swap_if_less(v_base, 0, 4, is_less);
|
||||
swap_if_less(v_base, 1, 2, is_less);
|
||||
swap_if_less(v_base, 3, 6, is_less);
|
||||
swap_if_less(v_base, 7, 8, is_less);
|
||||
swap_if_less(v_base, 9, 10, is_less);
|
||||
swap_if_less(v_base, 11, 12, is_less);
|
||||
swap_if_less(v_base, 4, 6, is_less);
|
||||
swap_if_less(v_base, 5, 9, is_less);
|
||||
swap_if_less(v_base, 8, 11, is_less);
|
||||
swap_if_less(v_base, 10, 12, is_less);
|
||||
swap_if_less(v_base, 0, 5, is_less);
|
||||
swap_if_less(v_base, 3, 8, is_less);
|
||||
swap_if_less(v_base, 4, 7, is_less);
|
||||
swap_if_less(v_base, 6, 11, is_less);
|
||||
swap_if_less(v_base, 9, 10, is_less);
|
||||
swap_if_less(v_base, 0, 1, is_less);
|
||||
swap_if_less(v_base, 2, 5, is_less);
|
||||
swap_if_less(v_base, 6, 9, is_less);
|
||||
swap_if_less(v_base, 7, 8, is_less);
|
||||
swap_if_less(v_base, 10, 11, is_less);
|
||||
swap_if_less(v_base, 1, 3, is_less);
|
||||
swap_if_less(v_base, 2, 4, is_less);
|
||||
swap_if_less(v_base, 5, 6, is_less);
|
||||
swap_if_less(v_base, 9, 10, is_less);
|
||||
swap_if_less(v_base, 1, 2, is_less);
|
||||
swap_if_less(v_base, 3, 4, is_less);
|
||||
swap_if_less(v_base, 5, 7, is_less);
|
||||
swap_if_less(v_base, 6, 8, is_less);
|
||||
swap_if_less(v_base, 2, 3, is_less);
|
||||
swap_if_less(v_base, 4, 5, is_less);
|
||||
swap_if_less(v_base, 6, 7, is_less);
|
||||
swap_if_less(v_base, 8, 9, is_less);
|
||||
swap_if_less(v_base, 3, 4, is_less);
|
||||
swap_if_less(v_base, 5, 6, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
|
||||
///
|
||||
/// # Safety
|
||||
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
|
||||
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
|
||||
// SAFETY: see individual comments.
|
||||
unsafe {
|
||||
// SAFETY: in-bounds as tail > begin.
|
||||
let mut sift = tail.sub(1);
|
||||
if !is_less(&*tail, &*sift) {
|
||||
return;
|
||||
}
|
||||
|
||||
// SAFETY: after this read tail is never read from again, as we only ever
|
||||
// read from sift, sift < tail and we only ever decrease sift. Thus this is
|
||||
// effectively a move, not a copy. Should a panic occur, or we have found
|
||||
// the correct insertion position, gap_guard ensures the element is moved
|
||||
// back into the array.
|
||||
let tmp = ManuallyDrop::new(tail.read());
|
||||
let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };
|
||||
|
||||
loop {
|
||||
// SAFETY: we move sift into the gap (which is valid), and point the
|
||||
// gap guard destination at sift, ensuring that if a panic occurs the
|
||||
// gap is once again filled.
|
||||
ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
|
||||
gap_guard.dst = sift;
|
||||
|
||||
if sift == begin {
|
||||
break;
|
||||
}
|
||||
|
||||
// SAFETY: we checked that sift != begin, thus this is in-bounds.
|
||||
sift = sift.sub(1);
|
||||
if !is_less(&tmp, &*sift) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
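// Illustrative trace: with [1, 3, 5, 2] and `tail` pointing at the 2, the 2 is
// read out into `tmp`, leaving a gap at index 3. 5 compares greater so it
// shifts into the gap (gap at index 2), 3 shifts as well (gap at index 1),
// 1 < 2 stops the loop, and the gap guard writes `tmp` back: [1, 2, 3, 5].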
|
||||
|
||||
/// Sort `v` assuming `v[..offset]` is already sorted.
|
||||
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
offset: usize,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
let len = v.len();
|
||||
if offset == 0 || offset > len {
|
||||
intrinsics::abort();
|
||||
}
|
||||
|
||||
// SAFETY: see individual comments.
|
||||
unsafe {
|
||||
// We write this basic loop directly using pointers, as when we use a
|
||||
// for loop LLVM likes to unroll this loop which we do not want.
|
||||
// SAFETY: v_end is the one-past-end pointer, and we checked that
|
||||
// offset <= len, thus tail is also in-bounds.
|
||||
let v_base = v.as_mut_ptr();
|
||||
let v_end = v_base.add(len);
|
||||
let mut tail = v_base.add(offset);
|
||||
while tail != v_end {
|
||||
// SAFETY: v_base and tail are both valid pointers to elements, and
|
||||
// v_base < tail since we checked offset != 0.
|
||||
insert_tail(v_base, tail, is_less);
|
||||
|
||||
// SAFETY: we checked that tail is not yet the one-past-end pointer.
|
||||
tail = tail.add(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
|
||||
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
|
||||
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
|
||||
v_base: *const T,
|
||||
dst: *mut T,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// By limiting select to picking pointers, we are guaranteed good cmov code-gen
|
||||
// regardless of type T's size. Further this only does 5 instead of 6
|
||||
// comparisons compared to a stable transposition 4 element sorting-network,
|
||||
// and always copies each element exactly once.
|
||||
|
||||
// SAFETY: all pointers have offset at most 3 from v_base and dst, and are
|
||||
// thus in-bounds by the precondition.
|
||||
unsafe {
|
||||
// Stably create two pairs a <= b and c <= d.
|
||||
let c1 = is_less(&*v_base.add(1), &*v_base);
|
||||
let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
|
||||
let a = v_base.add(c1 as usize);
|
||||
let b = v_base.add(!c1 as usize);
|
||||
let c = v_base.add(2 + c2 as usize);
|
||||
let d = v_base.add(2 + (!c2 as usize));
|
||||
|
||||
// Compare (a, c) and (b, d) to identify max/min. We're left with two
|
||||
// unknown elements, but because we are a stable sort we must know which
|
||||
// one is leftmost and which one is rightmost.
|
||||
// c3, c4 | min max unknown_left unknown_right
|
||||
// 0, 0 | a d b c
|
||||
// 0, 1 | a b c d
|
||||
// 1, 0 | c d a b
|
||||
// 1, 1 | c b a d
|
||||
let c3 = is_less(&*c, &*a);
|
||||
let c4 = is_less(&*d, &*b);
|
||||
let min = select(c3, c, a);
|
||||
let max = select(c4, b, d);
|
||||
let unknown_left = select(c3, a, select(c4, c, b));
|
||||
let unknown_right = select(c4, d, select(c3, b, c));
|
||||
|
||||
// Sort the last two unknown elements.
|
||||
let c5 = is_less(&*unknown_right, &*unknown_left);
|
||||
let lo = select(c5, unknown_right, unknown_left);
|
||||
let hi = select(c5, unknown_left, unknown_right);
|
||||
|
||||
ptr::copy_nonoverlapping(min, dst, 1);
|
||||
ptr::copy_nonoverlapping(lo, dst.add(1), 1);
|
||||
ptr::copy_nonoverlapping(hi, dst.add(2), 1);
|
||||
ptr::copy_nonoverlapping(max, dst.add(3), 1);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn select<T>(cond: bool, if_true: *const T, if_false: *const T) -> *const T {
|
||||
if cond { if_true } else { if_false }
|
||||
}
|
||||
}
|
||||
|
||||
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
|
||||
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
|
||||
/// be stored in `dst[0..8]`.
|
||||
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
|
||||
v_base: *mut T,
|
||||
dst: *mut T,
|
||||
scratch_base: *mut T,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// SAFETY: these pointers are all in-bounds by the precondition of our function.
|
||||
unsafe {
|
||||
sort4_stable(v_base, scratch_base, is_less);
|
||||
sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
|
||||
}
|
||||
|
||||
// SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
|
||||
// into dst.
|
||||
unsafe {
|
||||
bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut left_src: *const T,
|
||||
mut right_src: *const T,
|
||||
mut dst: *mut T,
|
||||
is_less: &mut F,
|
||||
) -> (*const T, *const T, *mut T) {
|
||||
// This is a branchless merge utility function.
|
||||
// The equivalent code with a branch would be:
|
||||
//
|
||||
// if !is_less(&*right_src, &*left_src) {
|
||||
// ptr::copy_nonoverlapping(left_src, dst, 1);
|
||||
// left_src = left_src.add(1);
|
||||
// } else {
|
||||
// ptr::copy_nonoverlapping(right_src, dst, 1);
|
||||
// right_src = right_src.add(1);
|
||||
// }
|
||||
// dst = dst.add(1);
|
||||
|
||||
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
|
||||
// to read and `dst` is valid to write, while not aliasing.
|
||||
unsafe {
|
||||
let is_l = !is_less(&*right_src, &*left_src);
|
||||
let src = if is_l { left_src } else { right_src };
|
||||
ptr::copy_nonoverlapping(src, dst, 1);
|
||||
right_src = right_src.add(!is_l as usize);
|
||||
left_src = left_src.add(is_l as usize);
|
||||
dst = dst.add(1);
|
||||
}
|
||||
|
||||
(left_src, right_src, dst)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut left_src: *const T,
|
||||
mut right_src: *const T,
|
||||
mut dst: *mut T,
|
||||
is_less: &mut F,
|
||||
) -> (*const T, *const T, *mut T) {
|
||||
// This is a branchless merge utility function.
|
||||
// The equivalent code with a branch would be:
|
||||
//
|
||||
// if !is_less(&*right_src, &*left_src) {
|
||||
// ptr::copy_nonoverlapping(right_src, dst, 1);
|
||||
// right_src = right_src.wrapping_sub(1);
|
||||
// } else {
|
||||
// ptr::copy_nonoverlapping(left_src, dst, 1);
|
||||
// left_src = left_src.wrapping_sub(1);
|
||||
// }
|
||||
// dst = dst.sub(1);
|
||||
|
||||
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
|
||||
// to read and `dst` is valid to write, while not aliasing.
|
||||
unsafe {
|
||||
let is_l = !is_less(&*right_src, &*left_src);
|
||||
let src = if is_l { right_src } else { left_src };
|
||||
ptr::copy_nonoverlapping(src, dst, 1);
|
||||
right_src = right_src.wrapping_sub(is_l as usize);
|
||||
left_src = left_src.wrapping_sub(!is_l as usize);
|
||||
dst = dst.sub(1);
|
||||
}
|
||||
|
||||
(left_src, right_src, dst)
|
||||
}
|
||||
|
||||
/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
|
||||
///
|
||||
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
|
||||
/// adapted to only use merge up and down. In contrast to the original
|
||||
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
|
||||
///
|
||||
/// # Safety
|
||||
/// The caller must guarantee that `dst` is valid for v.len() writes.
|
||||
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
|
||||
///
|
||||
/// Note that T must be Freeze; the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
|
||||
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
|
||||
v: &[T],
|
||||
dst: *mut T,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// It helps to visualize the merge:
|
||||
//
|
||||
// Initial:
|
||||
//
|
||||
// |dst (in dst)
|
||||
// |left |right
|
||||
// v v
|
||||
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
|
||||
// ^ ^
|
||||
// |left_rev |right_rev
|
||||
// |dst_rev (in dst)
|
||||
//
|
||||
// After:
|
||||
//
|
||||
// |dst (in dst)
|
||||
// |left | |right
|
||||
// v v v
|
||||
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
|
||||
// ^ ^ ^
|
||||
// |left_rev | |right_rev
|
||||
// |dst_rev (in dst)
|
||||
//
|
||||
// In each iteration one of left or right moves up one position, and one of
|
||||
// left_rev or right_rev moves down one position, whereas dst always moves
|
||||
// up one position and dst_rev always moves down one position. Assuming
|
||||
// the input was sorted and the comparison function is correctly implemented
|
||||
// at the end we will have left == left_rev + 1, and right == right_rev + 1,
|
||||
// fully consuming the input having written it to dst.
|
||||
|
||||
let len = v.len();
|
||||
let src = v.as_ptr();
|
||||
|
||||
let len_div_2 = len / 2;
|
||||
|
||||
// SAFETY: The caller has to ensure that len >= 2.
|
||||
unsafe {
|
||||
intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
|
||||
}
|
||||
|
||||
// SAFETY: no matter what the result of the user-provided comparison function
|
||||
// is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
|
||||
// will always be in bounds if the caller guarantees that `dst` is valid for
|
||||
// `v.len()` writes.
|
||||
unsafe {
|
||||
let mut left = src;
|
||||
let mut right = src.add(len_div_2);
|
||||
let mut dst = dst;
|
||||
|
||||
let mut left_rev = src.add(len_div_2 - 1);
|
||||
let mut right_rev = src.add(len - 1);
|
||||
let mut dst_rev = dst.add(len - 1);
|
||||
|
||||
for _ in 0..len_div_2 {
|
||||
(left, right, dst) = merge_up(left, right, dst, is_less);
|
||||
(left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
|
||||
}
|
||||
|
||||
let left_end = left_rev.wrapping_add(1);
|
||||
let right_end = right_rev.wrapping_add(1);
|
||||
|
||||
// Odd length, so one element is left unconsumed in the input.
|
||||
if len % 2 != 0 {
|
||||
let left_nonempty = left < left_end;
|
||||
let last_src = if left_nonempty { left } else { right };
|
||||
ptr::copy_nonoverlapping(last_src, dst, 1);
|
||||
left = left.add(left_nonempty as usize);
|
||||
right = right.add((!left_nonempty) as usize);
|
||||
}
|
||||
|
||||
// We now should have consumed the full input exactly once. This can
|
||||
// only fail if the comparison operator fails to be Ord, in which case
|
||||
// we will panic and never access the inconsistent state in dst.
|
||||
if left != left_end || right != right_end {
|
||||
panic_on_ord_violation();
|
||||
}
|
||||
}
|
||||
}
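// Illustrative example: merging v = [1, 3, 2, 4] writes min(1, 2) = 1 to
// dst[0] and max(3, 4) = 4 to dst[3] in the first iteration, then 2 and 3 in
// the second, producing [1, 2, 3, 4] with two writes per iteration instead of
// the four of the original parity merge.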
|
||||
|
||||
#[inline(never)]
|
||||
fn panic_on_ord_violation() -> ! {
|
||||
panic!("Ord violation");
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
|
||||
// Heuristic that holds true on all tested 64-bit capable architectures.
|
||||
mem::size_of::<T>() <= 8 // mem::size_of::<u64>()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn type_info() {
|
||||
assert!(has_efficient_in_place_swap::<i32>());
|
||||
assert!(has_efficient_in_place_swap::<u64>());
|
||||
assert!(!has_efficient_in_place_swap::<u128>());
|
||||
assert!(!has_efficient_in_place_swap::<String>());
|
||||
}
|
||||
|
||||
/// SAFETY: Only used for run-time optimization heuristic.
|
||||
#[rustc_unsafe_specialization_marker]
|
||||
trait CopyMarker {}
|
||||
|
||||
impl<T: Copy> CopyMarker for T {}
|
||||
|
||||
#[const_trait]
|
||||
trait IsCopy {
|
||||
fn is_copy() -> bool;
|
||||
}
|
||||
|
||||
impl<T> const IsCopy for T {
|
||||
default fn is_copy() -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
impl<T: CopyMarker> const IsCopy for T {
|
||||
fn is_copy() -> bool {
|
||||
true
|
||||
}
|
||||
}
|
library/core/src/slice/sort/stable/drift.rs (new file, 300 lines)
@@ -0,0 +1,300 @@
|
||||
//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
|
||||
//! Quicksort.
|
||||
|
||||
use crate::cmp;
|
||||
use crate::intrinsics;
|
||||
use crate::mem::MaybeUninit;
|
||||
|
||||
use crate::slice::sort::shared::find_existing_run;
|
||||
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
|
||||
use crate::slice::sort::stable::merge::merge;
|
||||
use crate::slice::sort::stable::quicksort::quicksort;
|
||||
|
||||
/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
|
||||
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
|
||||
/// worst-case complexity. `scratch.len()` must be at least `max(v.len() / 2,
|
||||
/// MIN_SMALL_SORT_SCRATCH_LEN)` otherwise the implementation may abort.
|
||||
/// Fully ascending and descending inputs will be sorted with exactly N - 1
|
||||
/// comparisons.
|
||||
///
|
||||
/// This is the main loop for driftsort, which uses powersort's heuristic to
|
||||
/// determine in which order to merge runs, see below for details.
|
||||
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
eager_sort: bool,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
let len = v.len();
|
||||
if len < 2 {
|
||||
return; // Removing this length check *increases* code size.
|
||||
}
|
||||
let scale_factor = merge_tree_scale_factor(len);
|
||||
|
||||
// It's important to have a relatively high entry barrier for pre-sorted
|
||||
// runs, as the presence of a single such run will force on average several
|
||||
// merge operations and shrink the maximum quicksort size a lot. For that
|
||||
// reason we use sqrt(len) as our pre-sorted run threshold.
|
||||
const MIN_SQRT_RUN_LEN: usize = 64;
|
||||
let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
|
||||
// For small input length `MIN_SQRT_RUN_LEN` would break pattern
|
||||
// detection of full or nearly sorted inputs.
|
||||
cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
|
||||
} else {
|
||||
sqrt_approx(len)
|
||||
};
|
||||
|
||||
// (stack_len, runs, desired_depths) together form a stack maintaining run
|
||||
// information for the powersort heuristic. desired_depths[i] is the desired
|
||||
// depth of the merge node that merges runs[i] with the run that comes after
|
||||
// it.
|
||||
let mut stack_len = 0;
|
||||
let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
|
||||
let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
|
||||
let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
|
||||
let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();
|
||||
|
||||
let mut scan_idx = 0;
|
||||
let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
|
||||
loop {
|
||||
// Compute the next run and the desired depth of the merge node between
|
||||
// prev_run and next_run. On the last iteration we create a dummy run
|
||||
// with root-level desired depth to fully collapse the merge tree.
|
||||
let (next_run, desired_depth);
|
||||
if scan_idx < len {
|
||||
next_run =
|
||||
create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
|
||||
desired_depth = merge_tree_depth(
|
||||
scan_idx - prev_run.len(),
|
||||
scan_idx,
|
||||
scan_idx + next_run.len(),
|
||||
scale_factor,
|
||||
);
|
||||
} else {
|
||||
next_run = DriftsortRun::new_sorted(0);
|
||||
desired_depth = 0;
|
||||
};
|
||||
|
||||
// Process the merge nodes between earlier runs[i] that have a desire to
|
||||
// be deeper in the merge tree than the merge node for the splitpoint
|
||||
// between prev_run and next_run.
|
||||
//
|
||||
// SAFETY: first note that this is the only place we modify stack_len,
|
||||
// runs or desired depths. We maintain the following invariants:
|
||||
// 1. The first stack_len elements of runs/desired_depths are initialized.
|
||||
// 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
|
||||
// 3. The sum of all valid runs[i].len() plus prev_run.len() equals
|
||||
// scan_idx.
|
||||
unsafe {
|
||||
while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
|
||||
// Desired depth greater than the upcoming desired depth, pop
|
||||
// left neighbor run from stack and merge into prev_run.
|
||||
let left = *runs.add(stack_len - 1);
|
||||
let merged_len = left.len() + prev_run.len();
|
||||
let merge_start_idx = scan_idx - merged_len;
|
||||
let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
|
||||
prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
|
||||
stack_len -= 1;
|
||||
}
|
||||
|
||||
// We now know that desired_depths[stack_len - 1] < desired_depth,
|
||||
// maintaining our invariant. This also guarantees we don't overflow
|
||||
// the stack as merge_tree_depth(..) <= 64 and thus we can only have
|
||||
// 64 distinct values on the stack before pushing, plus our initial
|
||||
// dummy run, while our capacity is 66.
|
||||
*runs.add(stack_len) = prev_run;
|
||||
*desired_depths.add(stack_len) = desired_depth;
|
||||
stack_len += 1;
|
||||
}
|
||||
|
||||
// Break before overriding the last run with our dummy run.
|
||||
if scan_idx >= len {
|
||||
break;
|
||||
}
|
||||
|
||||
scan_idx += next_run.len();
|
||||
prev_run = next_run;
|
||||
}
|
||||
|
||||
if !prev_run.sorted() {
|
||||
stable_quicksort(v, scratch, is_less);
|
||||
}
|
||||
}
|
||||
|
||||
// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
|
||||
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
|
||||
//
|
||||
// This method forms a binary merge tree, where each internal node corresponds
|
||||
// to a splitting point between the adjacent runs that have to be merged. If we
|
||||
// visualize our array as the number line from 0 to 1, we want to find the
|
||||
// dyadic fraction with smallest denominator that lies between the midpoints of
|
||||
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
|
||||
// desired depth in the binary merge tree this internal node wishes to have.
|
||||
// This does not always correspond to the actual depth due to the inherent
|
||||
// imbalance in runs, but we follow it as closely as possible.
|
||||
//
|
||||
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
|
||||
// finding the simplest dyadic fraction between midpoints corresponds to finding
|
||||
// the most significant bit difference of the midpoints. We save scale_factor =
|
||||
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
|
||||
// having to repeatedly do integer divides. This rescaling isn't exact when n is
|
||||
// not a power of two since we use integers and not reals, but the result is
|
||||
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
|
||||
// approximation errors stay entirely in the lower order bits.
|
||||
//
|
||||
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
|
||||
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
|
||||
// where CLZ counts the number of leading zeros in an integer and f is our scale
|
||||
// factor. Note that we omitted the division by two in the midpoint
|
||||
// calculations, as this simply shifts the bits by one position (and thus always
|
||||
// adds one to the result), and we only care about the relative depths.
|
||||
//
|
||||
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
|
||||
// x < (2^62 / n + 1) * 2n
|
||||
// x < 2^63 + 2n
|
||||
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
|
||||
// overflow.
|
||||
#[inline(always)]
|
||||
fn merge_tree_scale_factor(n: usize) -> u64 {
|
||||
if usize::BITS > u64::BITS {
|
||||
panic!("Platform not supported");
|
||||
}
|
||||
|
||||
((1 << 62) + n as u64 - 1) / n as u64
|
||||
}
|
||||
|
||||
// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
|
||||
// differ in some bit, and is <= 64 always.
|
||||
#[inline(always)]
|
||||
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
|
||||
let x = left as u64 + mid as u64;
|
||||
let y = mid as u64 + right as u64;
|
||||
((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
|
||||
}
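// Worked example: for n = 8 the scale factor is 2^59. The split at mid = 4
// (between [0, 4) and [4, 8)) gives x = 4, y = 12, so f*x ^ f*y = 2^62 and a
// desired depth of 1, while the split at mid = 2 gives f*x ^ f*y = 2^61 and
// depth 2; the central merge therefore sits closer to the root of the tree.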
|
||||
|
||||
fn sqrt_approx(n: usize) -> usize {
|
||||
// Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
|
||||
// two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
|
||||
// log floors we want to add 0.5 to compensate for this on average, so our
|
||||
// initial approximation is 2^((1 + floor(log2(n))) / 2).
|
||||
//
|
||||
// We then apply an iteration of Newton's method to improve our
|
||||
// approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
|
||||
//
|
||||
// Finally we note that the exponentiation / division can be done directly
|
||||
// with shifts. We OR with 1 to avoid zero-checks in the integer log.
|
||||
let ilog = (n | 1).ilog2();
|
||||
let shift = (1 + ilog) / 2;
|
||||
((1 << shift) + (n >> shift)) / 2
|
||||
}
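// For example: sqrt_approx(1000) computes ilog2(1001) = 9, shift = 5, and
// returns ((1 << 5) + (1000 >> 5)) / 2 = (32 + 31) / 2 = 31, close to the true
// sqrt(1000) ~= 31.6.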
|
||||
|
||||
// Lazy logical runs as in Glidesort.
|
||||
#[inline(always)]
|
||||
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
left: DriftsortRun,
|
||||
right: DriftsortRun,
|
||||
is_less: &mut F,
|
||||
) -> DriftsortRun {
|
||||
// If one or both of the runs are sorted do a physical merge, using
|
||||
// quicksort to sort the unsorted run if present. We also *need* to
|
||||
// physically merge if the combined runs would not fit in the scratch space
|
||||
// anymore (as this would mean we are no longer able to quicksort them).
|
||||
let len = v.len();
|
||||
let can_fit_in_scratch = len <= scratch.len();
|
||||
if !can_fit_in_scratch || left.sorted() || right.sorted() {
|
||||
if !left.sorted() {
|
||||
stable_quicksort(&mut v[..left.len()], scratch, is_less);
|
||||
}
|
||||
if !right.sorted() {
|
||||
stable_quicksort(&mut v[left.len()..], scratch, is_less);
|
||||
}
|
||||
merge(v, scratch, left.len(), is_less);
|
||||
|
||||
DriftsortRun::new_sorted(len)
|
||||
} else {
|
||||
DriftsortRun::new_unsorted(len)
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new logical run.
|
||||
///
|
||||
/// A logical run can either be sorted or unsorted. If there is a pre-existing
|
||||
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
|
||||
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
|
||||
/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it
|
||||
/// is false an unsorted run of length `min_good_run_len` is returned.
|
||||
fn create_run<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
min_good_run_len: usize,
|
||||
eager_sort: bool,
|
||||
is_less: &mut F,
|
||||
) -> DriftsortRun {
|
||||
let len = v.len();
|
||||
if len >= min_good_run_len {
|
||||
let (run_len, was_reversed) = find_existing_run(v, is_less);
|
||||
|
||||
// SAFETY: find_existing_run promises to return a valid run_len.
|
||||
unsafe { intrinsics::assume(run_len <= len) };
|
||||
|
||||
if run_len >= min_good_run_len {
|
||||
if was_reversed {
|
||||
v[..run_len].reverse();
|
||||
}
|
||||
|
||||
return DriftsortRun::new_sorted(run_len);
|
||||
}
|
||||
}
|
||||
|
||||
if eager_sort {
|
||||
// We call quicksort with a len that will immediately call small-sort.
|
||||
// By not calling the small-sort directly here it can always be inlined into
|
||||
// the quicksort itself, which makes the recursive base case faster and is generally
// more binary-size efficient.
|
||||
let eager_run_len = cmp::min(T::small_sort_threshold(), len);
|
||||
quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
|
||||
DriftsortRun::new_sorted(eager_run_len)
|
||||
} else {
|
||||
DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
|
||||
}
|
||||
}
|
||||
|
||||
fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
|
||||
// The binary OR by one is used to eliminate the zero-check in the logarithm.
|
||||
let limit = 2 * (v.len() | 1).ilog2();
|
||||
quicksort(v, scratch, limit, None, is_less);
|
||||
}
|
||||
|
||||
/// Compactly stores the length of a run, and whether or not it is sorted. This
|
||||
/// can always fit in a usize because the maximum slice length is isize::MAX.
|
||||
#[derive(Copy, Clone)]
|
||||
struct DriftsortRun(usize);
|
||||
|
||||
impl DriftsortRun {
|
||||
#[inline(always)]
|
||||
fn new_sorted(length: usize) -> Self {
|
||||
Self((length << 1) | 1)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn new_unsorted(length: usize) -> Self {
|
||||
Self(length << 1)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sorted(self) -> bool {
|
||||
self.0 & 1 == 1
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn len(self) -> usize {
|
||||
self.0 >> 1
|
||||
}
|
||||
}
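// For example: DriftsortRun::new_sorted(5) stores 0b1011, so len() == 5 and
// sorted() == true, while new_unsorted(5) stores 0b1010 with sorted() == false.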
|
library/core/src/slice/sort/stable/merge.rs (new file, 151 lines)
@@ -0,0 +1,151 @@
|
||||
//! This module contains logic for performing a merge of two sorted sub-slices.
|
||||
|
||||
use crate::cmp;
|
||||
use crate::mem::MaybeUninit;
|
||||
use crate::ptr;
|
||||
|
||||
/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `scratch` as
|
||||
/// temporary storage, and stores the result into `v[..]`.
|
||||
pub fn merge<T, F: FnMut(&T, &T) -> bool>(
|
||||
v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
mid: usize,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
let len = v.len();
|
||||
|
||||
if mid == 0 || mid >= len || scratch.len() < cmp::min(mid, len - mid) {
|
||||
return;
|
||||
}
|
||||
|
||||
// SAFETY: We checked that the two slices are non-empty and `mid` is in-bounds.
|
||||
// We checked that the buffer `scratch` has enough capacity to hold a copy of
|
||||
// the shorter slice. `merge_up` and `merge_down` are written in such a way that
|
||||
// they uphold the contract described in `MergeState::drop`.
|
||||
unsafe {
|
||||
// The merge process first copies the shorter run into `buf`. Then it traces
|
||||
// the newly copied run and the longer run forwards (or backwards), comparing
|
||||
// their next unconsumed elements and copying the lesser (or greater) one into `v`.
|
||||
//
|
||||
// As soon as the shorter run is fully consumed, the process is done. If the
|
||||
// longer run gets consumed first, then we must copy whatever is left of the
|
||||
// shorter run into the remaining gap in `v`.
|
||||
//
|
||||
// Intermediate state of the process is always tracked by `gap`, which serves
|
||||
// two purposes:
|
||||
// 1. Protects integrity of `v` from panics in `is_less`.
|
||||
// 2. Fills the remaining gap in `v` if the longer run gets consumed first.
|
||||
|
||||
let buf = MaybeUninit::slice_as_mut_ptr(scratch);
|
||||
|
||||
let v_base = v.as_mut_ptr();
|
||||
let v_mid = v_base.add(mid);
|
||||
let v_end = v_base.add(len);
|
||||
|
||||
let left_len = mid;
|
||||
let right_len = len - mid;
|
||||
|
||||
let left_is_shorter = left_len <= right_len;
|
||||
let save_base = if left_is_shorter { v_base } else { v_mid };
|
||||
let save_len = if left_is_shorter { left_len } else { right_len };
|
||||
|
||||
ptr::copy_nonoverlapping(save_base, buf, save_len);
|
||||
|
||||
let mut merge_state = MergeState { start: buf, end: buf.add(save_len), dst: save_base };
|
||||
|
||||
if left_is_shorter {
|
||||
merge_state.merge_up(v_mid, v_end, is_less);
|
||||
} else {
|
||||
merge_state.merge_down(v_base, buf, v_end, is_less);
|
||||
}
|
||||
// Finally, `merge_state` gets dropped. If the shorter run was not fully
|
||||
// consumed, whatever remains of it will now be copied into the hole in `v`.
|
||||
}
|
||||
|
||||
// When dropped, copies the range `start..end` into `dst..`.
|
||||
struct MergeState<T> {
|
||||
start: *mut T,
|
||||
end: *mut T,
|
||||
dst: *mut T,
|
||||
}
|
||||
|
||||
impl<T> MergeState<T> {
|
||||
/// # Safety
|
||||
/// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is
|
||||
/// the longer sub-slice and so that `dst` can be written to at least the shorter sub-slice
|
||||
/// length times. In addition `start -> end` and `right -> right_end` MUST be valid to be
|
||||
/// read. This function MUST only be called once.
|
||||
unsafe fn merge_up<F: FnMut(&T, &T) -> bool>(
|
||||
&mut self,
|
||||
mut right: *const T,
|
||||
right_end: *const T,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// SAFETY: See function safety comment.
|
||||
unsafe {
|
||||
let left = &mut self.start;
|
||||
let out = &mut self.dst;
|
||||
|
||||
while *left != self.end && right as *const T != right_end {
|
||||
let consume_left = !is_less(&*right, &**left);
|
||||
|
||||
let src = if consume_left { *left } else { right };
|
||||
ptr::copy_nonoverlapping(src, *out, 1);
|
||||
|
||||
*left = left.add(consume_left as usize);
|
||||
right = right.add(!consume_left as usize);
|
||||
|
||||
*out = out.add(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst` is
|
||||
/// the shorter sub-slice and so that `out` can be written to at least the shorter sub-slice
|
||||
/// length times. In addition `left_end <- dst` and `right_end <- end` MUST be valid to be
|
||||
/// read. This function MUST only be called once.
|
||||
unsafe fn merge_down<F: FnMut(&T, &T) -> bool>(
|
||||
&mut self,
|
||||
left_end: *const T,
|
||||
right_end: *const T,
|
||||
mut out: *mut T,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
// SAFETY: See function safety comment.
|
||||
unsafe {
|
||||
loop {
|
||||
let left = self.dst.sub(1);
|
||||
let right = self.end.sub(1);
|
||||
out = out.sub(1);
|
||||
|
||||
let consume_left = is_less(&*right, &*left);
|
||||
|
||||
let src = if consume_left { left } else { right };
|
||||
ptr::copy_nonoverlapping(src, out, 1);
|
||||
|
||||
self.dst = left.add(!consume_left as usize);
|
||||
self.end = right.add(consume_left as usize);
|
||||
|
||||
if self.dst as *const T == left_end || self.end as *const T == right_end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for MergeState<T> {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The user of MergeState MUST ensure, that at any point this drop
|
||||
// impl MAY run, for example when the user provided `is_less` panics, that
|
||||
// copying the contiguous region between `start` and `end` to `dst` will
|
||||
// leave the input slice `v` with each original element and all possible
|
||||
// modifications observed.
|
||||
unsafe {
|
||||
let len = self.end.sub_ptr(self.start);
|
||||
ptr::copy_nonoverlapping(self.start, self.dst, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
library/core/src/slice/sort/stable/mod.rs (new file, 116 lines)
@@ -0,0 +1,116 @@
|
||||
//! This module contains the entry points for `slice::sort`.
|
||||
|
||||
use crate::cmp;
|
||||
use crate::intrinsics;
|
||||
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
|
||||
|
||||
use crate::slice::sort::shared::smallsort::{
|
||||
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
|
||||
};
|
||||
|
||||
pub(crate) mod drift;
|
||||
pub(crate) mod merge;
|
||||
pub(crate) mod quicksort;
|
||||
|
||||
/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
|
||||
/// Design document:
|
||||
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
|
||||
///
|
||||
/// Upholds all safety properties outlined here:
|
||||
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
|
||||
#[inline(always)]
|
||||
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
|
||||
// Arrays of zero-sized types are always all-equal, and thus sorted.
|
||||
if T::IS_ZST {
|
||||
return;
|
||||
}
|
||||
|
||||
// Instrumenting the standard library showed that 90+% of the calls to sort
|
||||
// by rustc are either of size 0 or 1.
|
||||
let len = v.len();
|
||||
if intrinsics::likely(len < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
// More advanced sorting methods than insertion sort are faster if called in
|
||||
// a hot loop for small inputs, but for general-purpose code the small
|
||||
// binary size of insertion sort is more important. The instruction cache in
|
||||
// modern processors is very valuable, and for a single sort call in general
|
||||
// purpose code any gains from an advanced method are cancelled by i-cache
|
||||
// misses during the sort, and thrashing the i-cache for surrounding code.
|
||||
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
|
||||
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
|
||||
insertion_sort_shift_left(v, 1, is_less);
|
||||
return;
|
||||
}
|
||||
|
||||
driftsort_main::<T, F, BufT>(v, is_less);
|
||||
}
|
||||
|
||||
/// See [`sort`]
|
||||
///
|
||||
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
|
||||
/// inlined insertion sort i-cache footprint remains minimal.
|
||||
#[inline(never)]
|
||||
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
|
||||
// By allocating n elements of memory we can ensure the entire input can
|
||||
// be sorted using stable quicksort, which allows better performance on
|
||||
// random and low-cardinality distributions. However, we still want to
|
||||
// reduce our memory usage to n / 2 for large inputs. We do this by scaling
|
||||
// our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
|
||||
// small inputs and n / 2 for large inputs, without a sudden drop off. We
|
||||
// also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the
|
||||
// small-sort always needs this much memory.
|
||||
const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
|
||||
let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
|
||||
let len = v.len();
|
||||
let alloc_len =
|
||||
cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
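// For example (on a 64-bit target): for T = u64, max_full_alloc is
// 8_000_000 / 8 = 1_000_000 elements, so a slice of 10_000_000 elements gets
// alloc_len = max(5_000_000, min(10_000_000, 1_000_000)) = 5_000_000, while a
// slice of 100_000 elements gets alloc_len = 100_000, i.e. the full length.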
|
||||
|
||||
// For small inputs 4KiB of stack storage suffices, which allows us to avoid
|
||||
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
|
||||
let mut stack_buf = AlignedStorage::<T, 4096>::new();
|
||||
let stack_scratch = stack_buf.as_uninit_slice_mut();
|
||||
let mut heap_buf;
|
||||
let scratch = if stack_scratch.len() >= alloc_len {
|
||||
stack_scratch
|
||||
} else {
|
||||
heap_buf = BufT::with_capacity(alloc_len);
|
||||
heap_buf.as_uninit_slice_mut()
|
||||
};
|
||||
|
||||
// For small inputs using quicksort is not yet beneficial, and a single
|
||||
// small-sort or two small-sorts plus a single merge outperforms it, so use
|
||||
// eager mode.
|
||||
let eager_sort = len <= T::small_sort_threshold() * 2;
|
||||
crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
/// Abstracts an owned memory buffer, so that sort code can live in core where no allocation is
|
||||
/// possible. This trait can then be implemented in a place that has access to allocation.
|
||||
pub trait BufGuard<T> {
|
||||
/// Creates new buffer that holds at least `capacity` memory.
|
||||
fn with_capacity(capacity: usize) -> Self;
|
||||
/// Returns mutable access to uninitialized memory owned by the buffer.
|
||||
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct AlignedStorage<T, const N: usize> {
|
||||
_align: [T; 0],
|
||||
storage: [MaybeUninit<u8>; N],
|
||||
}
|
||||
|
||||
impl<T, const N: usize> AlignedStorage<T, N> {
|
||||
fn new() -> Self {
|
||||
Self { _align: [], storage: MaybeUninit::uninit_array() }
|
||||
}
|
||||
|
||||
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
|
||||
let len = N / mem::size_of::<T>();
|
||||
|
||||
// SAFETY: `_align` ensures we are correctly aligned.
|
||||
unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
|
||||
}
|
||||
}
|
library/core/src/slice/sort/stable/quicksort.rs (new file, 267 lines)
@@ -0,0 +1,267 @@
|
||||
//! This module contains a stable quicksort and partition implementation.
|
||||
|
||||
use crate::intrinsics;
|
||||
use crate::mem::{self, ManuallyDrop, MaybeUninit};
|
||||
use crate::ptr;
|
||||
|
||||
use crate::slice::sort::shared::pivot::choose_pivot;
|
||||
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
|
||||
use crate::slice::sort::shared::FreezeMarker;
|
||||
|
||||
/// Sorts `v` recursively using quicksort.
|
||||
///
|
||||
/// `limit`, when initialized with `c*log(v.len())` for some c, ensures we do not
|
||||
/// overflow the stack or go quadratic.
|
||||
#[inline(never)]
|
||||
pub fn quicksort<T, F: FnMut(&T, &T) -> bool>(
|
||||
mut v: &mut [T],
|
||||
scratch: &mut [MaybeUninit<T>],
|
||||
mut limit: u32,
|
||||
mut left_ancestor_pivot: Option<&T>,
|
||||
is_less: &mut F,
|
||||
) {
|
||||
loop {
|
||||
let len = v.len();
|
||||
|
||||
if len <= T::small_sort_threshold() {
|
||||
T::small_sort(v, scratch, is_less);
|
||||
return;
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
// We have had too many bad pivots, switch to O(n log n) fallback
|
||||
// algorithm. In our case that is driftsort in eager mode.
|
||||
crate::slice::sort::stable::drift::sort(v, scratch, true, is_less);
|
||||
return;
|
||||
}
|
||||
limit -= 1;
|
||||
|
||||
let pivot_pos = choose_pivot(v, is_less);
|
||||
// SAFETY: choose_pivot promises to return a valid pivot index.
|
||||
unsafe {
|
||||
intrinsics::assume(pivot_pos < v.len());
|
||||
}
|
||||
|
||||
// SAFETY: We only access the temporary copy for Freeze types, otherwise
|
||||
// self-modifications via `is_less` would not be observed and this would
|
||||
// be unsound. Our temporary copy does not escape this scope.
|
||||
let pivot_copy = unsafe { ManuallyDrop::new(ptr::read(&v[pivot_pos])) };
|
||||
let pivot_ref = (!has_direct_interior_mutability::<T>()).then_some(&*pivot_copy);
|
||||
|
||||
// We choose a pivot, and check if this pivot is equal to our left
|
||||
// ancestor. If true, we do a partition putting equal elements on the
|
||||
// left and do not recurse on it. This gives O(n log k) sorting for k
|
||||
// distinct values, a strategy borrowed from pdqsort. For types with
|
||||
// interior mutability we can't soundly create a temporary copy of the
|
||||
// ancestor pivot, and use left_partition_len == 0 as our method for
|
||||
// detecting when we re-use a pivot, which means we do at most three
|
||||
// partition operations with pivot p instead of the optimal two.
|
||||
let mut perform_equal_partition = false;
|
||||
if let Some(la_pivot) = left_ancestor_pivot {
|
||||
perform_equal_partition = !is_less(la_pivot, &v[pivot_pos]);
|
||||
}
|
||||
|
||||
let mut left_partition_len = 0;
|
||||
if !perform_equal_partition {
|
||||
left_partition_len = stable_partition(v, scratch, pivot_pos, false, is_less);
|
||||
perform_equal_partition = left_partition_len == 0;
|
||||
}
|
||||
|
||||
if perform_equal_partition {
|
||||
let mid_eq = stable_partition(v, scratch, pivot_pos, true, &mut |a, b| !is_less(b, a));
|
||||
v = &mut v[mid_eq..];
|
||||
left_ancestor_pivot = None;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process left side with the next loop iter, right side with recursion.
|
||||
let (left, right) = v.split_at_mut(left_partition_len);
|
||||
quicksort(right, scratch, limit, pivot_ref, is_less);
|
||||
v = left;
|
||||
}
|
||||
}
|
||||
|
||||
/// Partitions `v` using pivot `p = v[pivot_pos]` and returns the number of
/// elements less than `p`. The relative order of elements that compare < p and
/// those that compare >= p is preserved - it is a stable partition.
///
/// If `is_less` is not a strict total order or panics, `scratch.len() < v.len()`,
/// or `pivot_pos >= v.len()`, the result and `v`'s state are sound but unspecified.
fn stable_partition<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    pivot_pos: usize,
    pivot_goes_left: bool,
    is_less: &mut F,
) -> usize {
    let len = v.len();

    if intrinsics::unlikely(scratch.len() < len || pivot_pos >= len) {
        core::intrinsics::abort()
    }

    let v_base = v.as_ptr();
    let scratch_base = MaybeUninit::slice_as_mut_ptr(scratch);

    // The core idea is to write the values that compare as less-than to the left
    // side of `scratch`, while the values that compared as greater or equal than
    // `v[pivot_pos]` go to the right side of `scratch` in reverse. See
    // PartitionState for details.

    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: we made sure the scratch has length >= len and that pivot_pos
        // is in-bounds. v and scratch are disjoint slices.
        let pivot = v_base.add(pivot_pos);
        let mut state = PartitionState::new(v_base, scratch_base, len);

        let mut pivot_in_scratch = ptr::null_mut();
        let mut loop_end_pos = pivot_pos;

        // SAFETY: this loop is equivalent to calling state.partition_one
        // exactly len times.
        loop {
            // Ideally the outer loop won't be unrolled, to save binary size,
            // but we do want the inner loop to be unrolled for small types, as
            // this gave significant performance boosts in benchmarks. Unrolling
            // through for _ in 0..UNROLL_LEN { .. } instead of manually improves
            // compile times but has a ~10-20% performance penalty on opt-level=s.
            if const { mem::size_of::<T>() <= 16 } {
                const UNROLL_LEN: usize = 4;
                let unroll_end = v_base.add(loop_end_pos.saturating_sub(UNROLL_LEN - 1));
                while state.scan < unroll_end {
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                    state.partition_one(is_less(&*state.scan, &*pivot));
                }
            }

            let loop_end = v_base.add(loop_end_pos);
            while state.scan < loop_end {
                state.partition_one(is_less(&*state.scan, &*pivot));
            }

            if loop_end_pos == len {
                break;
            }

            // We avoid comparing pivot with itself, as this could create deadlocks for
            // certain comparison operators. We also store its location for later.
            pivot_in_scratch = state.partition_one(pivot_goes_left);

            loop_end_pos = len;
        }

        // `pivot` must be copied into its correct position again, because a
        // comparison operator might have modified it.
        if has_direct_interior_mutability::<T>() {
            ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
        }

        // SAFETY: partition_one being called exactly len times guarantees that scratch
        // is initialized with a permuted copy of `v`, and that num_left <= v.len().
        // Copying scratch[0..num_left] and scratch[num_left..v.len()] back is thus
        // sound, as the values in scratch will never be read again, meaning our copies
        // semantically act as moves, permuting `v`.

        // Copy all the elements < p directly from swap to v.
        let v_base = v.as_mut_ptr();
        ptr::copy_nonoverlapping(scratch_base, v_base, state.num_left);

        // Copy the elements >= p in reverse order.
        for i in 0..len - state.num_left {
            ptr::copy_nonoverlapping(
                scratch_base.add(len - 1 - i),
                v_base.add(state.num_left + i),
                1,
            );
        }

        state.num_left
    }
}
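
As an aside, the scratch-buffer scheme used by `stable_partition` can be illustrated with a simplified safe sketch (illustrative only, `stable_partition_demo` is not part of this diff): values `< pivot` are appended to the front of the scratch, the rest are written to the back in reverse, and copying back restores both groups in their original relative order.

// Stable partition of `v` around `pivot`: elements `< pivot` keep their order
// at the front, the remaining elements keep their order behind them.
fn stable_partition_demo<T: Clone + Ord>(v: &mut [T], pivot: &T) -> usize {
    let len = v.len();
    let mut scratch: Vec<Option<T>> = vec![None; len];
    let mut num_left = 0;
    let mut rev = len; // grows downwards; the >= side is stored in reverse

    for x in v.iter() {
        if x < pivot {
            scratch[num_left] = Some(x.clone());
            num_left += 1;
        } else {
            rev -= 1;
            scratch[rev] = Some(x.clone());
        }
    }

    // Copy back: the < side as-is, the >= side reversed to restore its order.
    for (i, slot) in scratch[..num_left].iter_mut().enumerate() {
        v[i] = slot.take().unwrap();
    }
    for (i, slot) in scratch[num_left..].iter_mut().rev().enumerate() {
        v[num_left + i] = slot.take().unwrap();
    }
    num_left
}

For example, `stable_partition_demo(&mut [3, 1, 4, 1, 5], &3)` leaves `[1, 1, 3, 4, 5]` and returns 2; both the two 1s and the 3, 4, 5 keep their relative order.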

struct PartitionState<T> {
    // The start of the scratch auxiliary memory.
    scratch_base: *mut T,
    // The current element that is being looked at, scans left to right through slice.
    scan: *const T,
    // Counts the number of elements that went to the left side, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_left: usize,
    // Reverse scratch output pointer.
    scratch_rev: *mut T,
}

impl<T> PartitionState<T> {
    /// # Safety
    /// scan and scratch must point to valid disjoint buffers of length len. The
    /// scan buffer must be initialized.
    unsafe fn new(scan: *const T, scratch: *mut T, len: usize) -> Self {
        // SAFETY: See function safety comment.
        unsafe { Self { scratch_base: scratch, scan, num_left: 0, scratch_rev: scratch.add(len) } }
    }

    /// Depending on the value of `towards_left` this function will write a value
    /// to the growing left or right side of the scratch memory. This forms the
    /// branchless core of the partition.
    ///
    /// # Safety
    /// This function may be called at most `len` times. If it is called exactly
    /// `len` times the scratch buffer then contains a copy of each element from
    /// the scan buffer exactly once - a permutation, and num_left <= len.
    unsafe fn partition_one(&mut self, towards_left: bool) -> *mut T {
        // SAFETY: see individual comments.
        unsafe {
            // SAFETY: in-bounds because this function is called at most len times, and thus
            // right now is incremented at most len - 1 times. Similarly, num_left < len and
            // num_right < len, where num_right == i - num_left at the start of the ith
            // iteration (zero-indexed).
            self.scratch_rev = self.scratch_rev.sub(1);

            // SAFETY: now we have scratch_rev == base + len - (i + 1). This means
            // scratch_rev + num_left == base + len - 1 - num_right < base + len.
            let dst_base = if towards_left { self.scratch_base } else { self.scratch_rev };
            let dst = dst_base.add(self.num_left);
            ptr::copy_nonoverlapping(self.scan, dst, 1);

            self.num_left += towards_left as usize;
            self.scan = self.scan.add(1);
            dst
        }
    }
}

#[const_trait]
trait IsFreeze {
    fn is_freeze() -> bool;
}

impl<T> const IsFreeze for T {
    default fn is_freeze() -> bool {
        false
    }
}
impl<T: FreezeMarker> const IsFreeze for T {
    fn is_freeze() -> bool {
        true
    }
}

#[must_use]
const fn has_direct_interior_mutability<T>() -> bool {
    // If a type has interior mutability it may alter itself during comparison
    // in a way that must be preserved after the sort operation concludes.
    // Otherwise a type like Mutex<Option<Box<str>>> could lead to double free.
    !T::is_freeze()
}

#[test]
fn freeze_check() {
    assert!(!has_direct_interior_mutability::<u32>());
    assert!(!has_direct_interior_mutability::<[u128; 2]>());

    assert!(has_direct_interior_mutability::<crate::cell::Cell<u32>>());
    assert!(has_direct_interior_mutability::<crate::sync::Mutex<u32>>());
}
80  library/core/src/slice/sort/unstable/heapsort.rs  Normal file
@ -0,0 +1,80 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.

use crate::intrinsics;
use crate::ptr;

/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, it sits in the main hot-loop in `recurse` and is meant as an unlikely
/// algorithmic fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(v.len() >= 2);

        // Build the heap in linear time.
        for i in (0..v.len() / 2).rev() {
            sift_down(v, i, is_less);
        }

        // Pop maximal elements from the heap.
        for i in (1..v.len()).rev() {
            v.swap(0, i);
            sift_down(&mut v[..i], 0, is_less);
        }
    }
}

// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: See function safety.
    unsafe {
        intrinsics::assume(node < v.len());
    }

    let len = v.len();

    let v_base = v.as_mut_ptr();

    loop {
        // Children of `node`.
        let mut child = 2 * node + 1;
        if child >= len {
            break;
        }

        // SAFETY: The invariants and checks guarantee that both node and child are in-bounds.
        unsafe {
            // Choose the greater child.
            if child + 1 < len {
                // We need a branch to be sure not to index out-of-bounds,
                // but it's highly predictable. The comparison, however,
                // is better done branchless, especially for primitives.
                child += is_less(&*v_base.add(child), &*v_base.add(child + 1)) as usize;
            }

            // Stop if the invariant holds at `node`.
            if !is_less(&*v_base.add(node), &*v_base.add(child)) {
                break;
            }

            // Swap `node` with the greater child, move one step down, and continue sifting. This
            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
            ptr::swap(v_base.add(node), v_base.add(child));
        }

        node = child;
    }
}
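
For intuition, a safe, index-based sketch of the same sift-down (hypothetical `sift_down_demo`, not the pointer-based version above): the child selection is written as a branchless add, mirroring the `is_less(..) as usize` trick.

fn sift_down_demo<T: Ord>(v: &mut [T], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater of the two children; the comparison result is
        // added as 0 or 1 instead of branching on it.
        if child + 1 < v.len() {
            child += (v[child] < v[child + 1]) as usize;
        }
        // Stop once the heap invariant `parent >= child` holds at `node`.
        if v[node] >= v[child] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}
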
76  library/core/src/slice/sort/unstable/mod.rs  Normal file
@ -0,0 +1,76 @@
//! This module contains the entry points for `slice::sort_unstable`.

use crate::intrinsics;
use crate::mem::SizedTypeProperties;

use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
pub(crate) mod quicksort;

/// Unstable sort called ipnsort by Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }

    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }

    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }

    ipnsort(v, is_less);
}
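
For intuition, a safe sketch of the kind of insertion sort this threshold hands off to (hypothetical `insertion_sort_prefix`, not the `shared::smallsort` implementation): like `insertion_sort_shift_left(v, 1, ..)` it treats the first `sorted_len` elements as an already-sorted prefix.

fn insertion_sort_prefix<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], sorted_len: usize, is_less: &mut F) {
    for i in sorted_len..v.len() {
        // Shift v[i] to the left until its left neighbor is not greater.
        let mut j = i;
        while j > 0 && is_less(&v[j], &v[j - 1]) {
            v.swap(j, j - 1);
            j -= 1;
        }
    }
}
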
/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();
    let (run_len, was_reversed) = find_existing_run(v, is_less);

    // SAFETY: find_existing_run promises to return a valid run_len.
    unsafe { intrinsics::assume(run_len <= len) };

    if run_len == len {
        if was_reversed {
            v.reverse();
        }

        // It would be possible to do in-place merging here for a long existing streak, but that
        // makes the implementation a lot bigger; users can use `slice::sort` for that use-case.
        return;
    }

    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
    // The binary OR by one is used to eliminate the zero-check in the logarithm.
    let limit = 2 * (len | 1).ilog2();
    crate::slice::sort::unstable::quicksort::quicksort(v, None, limit, is_less);
}
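
A small standalone sketch of what that limit works out to in practice, mirroring the line above:

fn main() {
    // Mirrors `let limit = 2 * (len | 1).ilog2();` from ipnsort.
    for len in [2usize, 20, 1_000, 1_000_000] {
        let limit = 2 * (len | 1).ilog2();
        println!("len = {len:>9} -> at most {limit} imbalanced partitions before heapsort");
    }
}

For a million elements this allows at most 38 badly balanced partitions before the quicksort bails out to heapsort, which is what bounds the worst case at *O*(*n* \* log(*n*)).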
347  library/core/src/slice/sort/unstable/quicksort.rs  Normal file
@ -0,0 +1,347 @@
//! This module contains an unstable quicksort and two partition implementations.

use crate::intrinsics;
use crate::mem::{self, ManuallyDrop};
use crate::ptr;

use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;

/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `ancestor_pivot`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
pub(crate) fn quicksort<'a, T, F>(
    mut v: &'a mut [T],
    mut ancestor_pivot: Option<&'a T>,
    mut limit: u32,
    is_less: &mut F,
) where
    F: FnMut(&T, &T) -> bool,
{
    loop {
        if v.len() <= T::small_sort_threshold() {
            T::small_sort(v, is_less);
            return;
        }

        // If too many bad pivot choices were made, simply fall back to heapsort in order to
        // guarantee `O(N x log(N))` worst-case.
        if limit == 0 {
            // SAFETY: We assume the `small_sort` threshold is at least 1.
            unsafe {
                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
            }
            return;
        }

        limit -= 1;

        // Choose a pivot and try guessing whether the slice is already sorted.
        let pivot_pos = choose_pivot(v, is_less);

        // If the chosen pivot is equal to the predecessor, then it's the smallest element in the
        // slice. Partition the slice into elements equal to and elements greater than the pivot.
        // This case is usually hit when the slice contains many duplicate elements.
        if let Some(p) = ancestor_pivot {
            // SAFETY: We assume choose_pivot yields an in-bounds position.
            if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
                let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));

                // Continue sorting elements greater than the pivot. We know that `num_lt` contains
                // the pivot. So we can continue after `num_lt`.
                v = &mut v[(num_lt + 1)..];
                ancestor_pivot = None;
                continue;
            }
        }

        // Partition the slice.
        let num_lt = partition(v, pivot_pos, is_less);
        // SAFETY: partition ensures that `num_lt` will be in-bounds.
        unsafe { intrinsics::assume(num_lt < v.len()) };

        // Split the slice into `left`, `pivot`, and `right`.
        let (left, right) = v.split_at_mut(num_lt);
        let (pivot, right) = right.split_at_mut(1);
        let pivot = &pivot[0];

        // Recurse into the left side. We have a fixed recursion limit, testing shows no real
        // benefit for recursing into the shorter side.
        quicksort(left, ancestor_pivot, limit, is_less);

        // Continue with the right side.
        v = right;
        ancestor_pivot = Some(pivot);
    }
}

/// Takes the input slice `v` and re-arranges elements such that when the call returns normally
/// all elements that compare true for `is_less(elem, pivot)` where `pivot == v[pivot_pos]` are
/// on the left side of `v` followed by the other elements, notionally considered greater or
/// equal to `pivot`.
///
/// Returns the number of elements that are compared true for `is_less(elem, pivot)`.
///
/// If `is_less` does not implement a total order the resulting order and return value are
/// unspecified. All original elements will remain in `v` and any possible modifications via
/// interior mutability will be observable. Same is true if `is_less` panics or `v.len()`
/// exceeds `scratch.len()`.
pub(crate) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    // Allows for panic-free code-gen by proving this property to the compiler.
    if len == 0 {
        return 0;
    }

    // Allows for panic-free code-gen by proving this property to the compiler.
    if pivot >= len {
        intrinsics::abort();
    }

    // Place the pivot at the beginning of slice.
    v.swap(0, pivot);
    let (pivot, v_without_pivot) = v.split_at_mut(1);

    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
    // into a stack value, but this creates issues for types with interior mutability mandating
    // a drop guard.
    let pivot = &mut pivot[0];

    // This construct is used to limit the LLVM IR generated, which saves large amounts of
    // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
    let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

    // Place the pivot between the two partitions.
    v.swap(0, num_lt);

    num_lt
}

const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut F) -> usize {
    const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
    if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
        // Specialize for types that are relatively cheap to copy, where branchless optimizations
        // have large leverage e.g. `u64` and `String`.
        partition_lomuto_branchless_cyclic::<T, F>
    } else {
        partition_hoare_branchy_cyclic::<T, F>
    }
}
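
The `const { inst_partition::<T, F>() }` call above picks one concrete partition function per element type at compile time. A minimal, self-contained sketch of the same pattern (hypothetical names `pick_impl`, `small_impl`, `large_impl`); the intent is that only the selected function has to be codegen'd for a given `T`:

fn small_impl<T>(v: &mut [T]) {
    // ... path for cheap-to-copy types ...
    let _ = v;
}

fn large_impl<T>(v: &mut [T]) {
    // ... path for expensive-to-copy types ...
    let _ = v;
}

// Evaluated at compile time; returns a plain function pointer.
const fn pick_impl<T>() -> fn(&mut [T]) {
    if std::mem::size_of::<T>() <= 16 { small_impl::<T> } else { large_impl::<T> }
}

fn run<T>(v: &mut [T]) {
    // The inline-const block forces the selection to happen during compilation.
    (const { pick_impl::<T>() })(v);
}
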
/// See [`partition`].
fn partition_hoare_branchy_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();

    if len == 0 {
        return 0;
    }

    // Optimized for large types that are expensive to move. Not optimized for integers. Optimized
    // for small code-gen, assuming that is_less is an expensive operation that generates
    // substantial amounts of code or a call. And that copying elements will likely be a call to
    // memcpy. Using 2 `ptr::copy_nonoverlapping` has the chance to be faster than
    // `ptr::swap_nonoverlapping` because `memcpy` can use wide SIMD based on runtime feature
    // detection. Benchmarks support this analysis.

    let mut gap_opt: Option<GapGuard<T>> = None;

    // SAFETY: The left-to-right scanning loop performs a bounds check, where we know that `left >=
    // v_base && left < right && right <= v_base.add(len)`. The right-to-left scanning loop performs
    // a bounds check ensuring that `right` is in-bounds. We checked that `len` is more than zero,
    // which means that unconditional `right = right.sub(1)` is safe to do. The exit check makes
    // sure that `left` and `right` never alias, making `ptr::copy_nonoverlapping` safe. The
    // drop-guard `gap` ensures that should `is_less` panic we always overwrite the duplicate in the
    // input. `gap.pos` stores the previous value of `right` and starts at `right` and so it too is
    // in-bounds. We never pass the saved `gap.value` to `is_less` while it is inside the `GapGuard`
    // thus any changes via interior mutability will be observed.
    unsafe {
        let v_base = v.as_mut_ptr();

        let mut left = v_base;
        let mut right = v_base.add(len);

        loop {
            // Find the first element greater than the pivot.
            while left < right && is_less(&*left, pivot) {
                left = left.add(1);
            }

            // Find the last element equal to the pivot.
            loop {
                right = right.sub(1);
                if left >= right || is_less(&*right, pivot) {
                    break;
                }
            }

            if left >= right {
                break;
            }

            // Swap the found pair of out-of-order elements via cyclic permutation.
            let is_first_swap_pair = gap_opt.is_none();

            if is_first_swap_pair {
                gap_opt = Some(GapGuard { pos: right, value: ManuallyDrop::new(ptr::read(left)) });
            }

            let gap = gap_opt.as_mut().unwrap_unchecked();

            // Single place where we instantiate ptr::copy_nonoverlapping in the partition.
            if !is_first_swap_pair {
                ptr::copy_nonoverlapping(left, gap.pos, 1);
            }
            gap.pos = right;
            ptr::copy_nonoverlapping(right, left, 1);

            left = left.add(1);
        }

        left.sub_ptr(v_base)

        // `gap_opt` goes out of scope and overwrites the last wrong-side element on the right side
        // with the first wrong-side element of the left side that was initially overwritten by the
        // first wrong-side element on the right side.
    }
}

struct PartitionState<T> {
    // The current element that is being looked at, scans left to right through slice.
    right: *mut T,
    // Counts the number of elements that compared less-than, also works around:
    // https://github.com/rust-lang/rust/issues/117128
    num_lt: usize,
    // Gap guard that tracks the temporary duplicate in the input.
    gap: GapGuardRaw<T>,
}

fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
    F: FnMut(&T, &T) -> bool,
{
    // Novel partition implementation by Lukas Bergdoll and Orson Peters. Branchless Lomuto
    // partition paired with a cyclic permutation.
    // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md

    let len = v.len();
    let v_base = v.as_mut_ptr();

    if len == 0 {
        return 0;
    }

    // SAFETY: We checked that `len` is more than zero, which means that reading `v_base` is safe to
    // do. From there we have a bounded loop where `v_base.add(i)` is guaranteed in-bounds. `v` and
    // `pivot` can't alias because of type system rules. The drop-guard `gap` ensures that should
    // `is_less` panic we always overwrite the duplicate in the input. `gap.pos` stores the previous
    // value of `right` and starts at `v_base` and so it too is in-bounds. Given `UNROLL_LEN == 2`
    // after the main loop we either have A) the last element in `v` that has not yet been processed
    // because `len % 2 != 0`, or B) all elements have been processed except the gap value that was
    // saved at the beginning with `ptr::read(v_base)`. In the case A) the loop will iterate twice,
    // first performing loop_body to take care of the last element that didn't fit into the unroll.
    // After that the behavior is the same as for B) where we use the saved value as `right` to
    // overwrite the duplicate. If this very last call to `is_less` panics the saved value will be
    // copied back including all possible changes via interior mutability. If `is_less` does not
    // panic and the code continues we overwrite the duplicate and do `right = right.add(1)`, this
    // is safe to do with `&mut *gap.value` because `T` is the same as `[T; 1]` and generating a
    // pointer one past the allocation is safe.
    unsafe {
        let mut loop_body = |state: &mut PartitionState<T>| {
            let right_is_lt = is_less(&*state.right, pivot);
            let left = v_base.add(state.num_lt);

            ptr::copy(left, state.gap.pos, 1);
            ptr::copy_nonoverlapping(state.right, left, 1);

            state.gap.pos = state.right;
            state.num_lt += right_is_lt as usize;

            state.right = state.right.add(1);
        };

        // Ideally we could just use GapGuard in PartitionState, but the reference that is
        // materialized with `&mut state` when calling `loop_body` would create a mutable reference
        // to the parent struct that contains the gap value, invalidating the reference pointer
        // created from a reference to the gap value in the cleanup loop. This is only an issue
        // under Stacked Borrows, Tree Borrows accepts the intuitive code using GapGuard as valid.
        let mut gap_value = ManuallyDrop::new(ptr::read(v_base));

        let mut state = PartitionState {
            num_lt: 0,
            right: v_base.add(1),

            gap: GapGuardRaw { pos: v_base, value: &mut *gap_value },
        };

        // Manual unrolling that works well on x86, Arm and with opt-level=s without murdering
        // compile-times. Leaving this to the compiler yields ok to bad results.
        let unroll_len = const { if mem::size_of::<T>() <= 16 { 2 } else { 1 } };

        let unroll_end = v_base.add(len - (unroll_len - 1));
        while state.right < unroll_end {
            if unroll_len == 2 {
                loop_body(&mut state);
                loop_body(&mut state);
            } else {
                loop_body(&mut state);
            }
        }

        // Single instantiate `loop_body` for both the unroll cleanup and cyclic permutation
        // cleanup. Optimizes binary-size and compile-time.
        let end = v_base.add(len);
        loop {
            let is_done = state.right == end;
            state.right = if is_done { state.gap.value } else { state.right };

            loop_body(&mut state);

            if is_done {
                mem::forget(state.gap);
                break;
            }
        }

        state.num_lt
    }
}
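
For intuition, a safe sketch of a plain branchless Lomuto partition (hypothetical `lomuto_branchless_demo`, without the cyclic-permutation optimization used above): the swap is performed unconditionally and only the `num_lt` counter depends on the comparison result.

fn lomuto_branchless_demo<T: Ord>(v: &mut [T], pivot: &T) -> usize {
    let mut num_lt = 0;
    for i in 0..v.len() {
        let is_lt = v[i] < *pivot;
        // Unconditional swap: when the scanned element is less than the pivot
        // this grows the less-than prefix; otherwise it exchanges two elements
        // of the >= region (a no-op when both indices are the same slot).
        v.swap(num_lt, i);
        num_lt += is_lt as usize;
    }
    num_lt
}

For example, `lomuto_branchless_demo(&mut [2, 5, 1, 4, 0], &3)` returns 3 and leaves the first three slots holding the elements smaller than 3.
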
struct GapGuard<T> {
    pos: *mut T,
    value: ManuallyDrop<T>,
}

impl<T> Drop for GapGuard<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(&*self.value, self.pos, 1);
        }
    }
}

/// Ideally this wouldn't be needed and we could just use the regular GapGuard.
/// See comment in [`partition_lomuto_branchless_cyclic`].
struct GapGuardRaw<T> {
    pos: *mut T,
    value: *mut T,
}

impl<T> Drop for GapGuardRaw<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.value, self.pos, 1);
        }
    }
}
@ -49,7 +49,6 @@
#![feature(is_sorted)]
#![feature(layout_for_ptr)]
#![feature(pattern)]
#![feature(sort_internals)]
#![feature(slice_take)]
#![feature(slice_from_ptr_range)]
#![feature(slice_split_once)]
@ -1803,9 +1803,8 @@ fn brute_force_rotate_test_1() {
#[test]
#[cfg(not(target_arch = "wasm32"))]
fn sort_unstable() {
use core::cmp::Ordering::{Equal, Greater, Less};
use core::slice::heapsort;
use rand::{seq::SliceRandom, Rng};
// use core::cmp::Ordering::{Equal, Greater, Less};
use rand::Rng;

// Miri is too slow (but still need to `chain` to make the types match)
let lens = if cfg!(miri) { (2..20).chain(0..0) } else { (2..25).chain(500..510) };
@ -1839,31 +1838,10 @@ fn sort_unstable() {
tmp.copy_from_slice(v);
tmp.sort_unstable_by(|a, b| b.cmp(a));
assert!(tmp.windows(2).all(|w| w[0] >= w[1]));

// Test heapsort using `<` operator.
tmp.copy_from_slice(v);
heapsort(tmp, |a, b| a < b);
assert!(tmp.windows(2).all(|w| w[0] <= w[1]));

// Test heapsort using `>` operator.
tmp.copy_from_slice(v);
heapsort(tmp, |a, b| a > b);
assert!(tmp.windows(2).all(|w| w[0] >= w[1]));
}
}
}

// Sort using a completely random comparison function.
// This will reorder the elements *somehow*, but won't panic.
for i in 0..v.len() {
v[i] = i as i32;
}
v.sort_unstable_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
v.sort_unstable();
for i in 0..v.len() {
assert_eq!(v[i], i as i32);
}

// Should not panic.
[0i32; 0].sort_unstable();
[(); 10].sort_unstable();