diff --git a/library/core/src/slice/sort/select.rs b/library/core/src/slice/sort/select.rs
index f6529f23bcb..3358c03d30a 100644
--- a/library/core/src/slice/sort/select.rs
+++ b/library/core/src/slice/sort/select.rs
@@ -7,6 +7,7 @@
 //! better performance than one would get using heapsort as fallback.

 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 use crate::slice::sort::unstable::quicksort::partition;
@@ -40,7 +41,13 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                median_of_medians(v, &mut is_less, index);
+            } else {
+                partition_at_index_loop(v, index, None, &mut is_less);
+            }
+        }
     }

     let (left, right) = v.split_at_mut(index);
@@ -53,6 +60,7 @@ where
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;

+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
         if v.len() >= 2 {
             insertion_sort_shift_left(v, 1, is_less);
         }
+
         return;
     }

diff --git a/library/core/src/slice/sort/shared/mod.rs b/library/core/src/slice/sort/shared/mod.rs
index ad1171bfc6a..e0f8d475a2e 100644
--- a/library/core/src/slice/sort/shared/mod.rs
+++ b/library/core/src/slice/sort/shared/mod.rs
@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]
+
 use crate::marker::Freeze;

 pub(crate) mod pivot;
diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs
index fae628a7c14..6adf779a72f 100644
--- a/library/core/src/slice/sort/shared/smallsort.rs
+++ b/library/core/src/slice/sort/shared/smallsort.rs
@@ -378,7 +378,12 @@ where

 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+///
+/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like
+/// types. `is_less` could be a huge function and we want to give the compiler an option to
+/// not inline this function. For the same reasons that this function is very perf critical
+/// it should be in the same module as the functions that use it.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
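For context on the `swap_if_less` primitive whose visibility and docs change above: a minimal, hedged sketch in safe Rust (hypothetical `_demo` names, not the in-tree raw-pointer implementation) of how such a compare-and-swap step composes into the kind of fixed comparator network the small-sort code builds on top of it.

// Hypothetical illustration only: same observable behavior as the `swap_if_less` documented
// above, plus the classic 5-comparator sorting network for 4 elements. The in-tree smallsort
// works on raw pointers and branchless selects instead of safe indexing.
fn swap_if_less_demo<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], a: usize, b: usize, is_less: &mut F) {
    if is_less(&v[b], &v[a]) {
        v.swap(a, b);
    }
}

fn sort4_demo<T, F: FnMut(&T, &T) -> bool>(v: &mut [T; 4], is_less: &mut F) {
    // Optimal network for 4 elements: after these comparators the array is sorted for every
    // possible input order.
    for &(a, b) in &[(0, 1), (2, 3), (0, 2), (1, 3), (1, 2)] {
        swap_if_less_demo(&mut v[..], a, b, is_less);
    }
}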
diff --git a/library/core/src/slice/sort/stable/mod.rs b/library/core/src/slice/sort/stable/mod.rs
index c6f637b3d27..e13fbc37e80 100644
--- a/library/core/src/slice/sort/stable/mod.rs
+++ b/library/core/src/slice/sort/stable/mod.rs
@@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.

+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
     SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
 };
-use crate::{cmp, intrinsics};

-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;

+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,53 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
         return;
     }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            let alloc_len = len / 2;

-    driftsort_main::<T, F, BufT>(v, is_less);
+            cfg_if! {
+                if #[cfg(target_pointer_width = "16")] {
+                    let mut heap_buf = BufT::with_capacity(alloc_len);
+                    let scratch = heap_buf.as_uninit_slice_mut();
+                } else {
+                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
+                    let stack_scratch = stack_buf.as_uninit_slice_mut();
+                    let mut heap_buf;
+                    let scratch = if stack_scratch.len() >= alloc_len {
+                        stack_scratch
+                    } else {
+                        heap_buf = BufT::with_capacity(alloc_len);
+                        heap_buf.as_uninit_slice_mut()
+                    };
+                }
+            }
+
+            tiny::mergesort(v, scratch, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }
+
+            driftsort_main::<T, F, BufT>(v, is_less);
+        }
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
     // By allocating n elements of memory we can ensure the entire input can
diff --git a/library/core/src/slice/sort/stable/tiny.rs b/library/core/src/slice/sort/stable/tiny.rs
new file mode 100644
index 00000000000..071ab8e107f
--- /dev/null
+++ b/library/core/src/slice/sort/stable/tiny.rs
@@ -0,0 +1,41 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::MaybeUninit;
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            let v_base = v.as_mut_ptr();
+            let v_a = v_base;
+            let v_b = v_base.add(1);
+
+            if is_less(&*v_b, &*v_a) {
+                ptr::swap_nonoverlapping(v_a, v_b, 1);
+            }
+        }
+    }
+}
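The new `tiny::mergesort` recurses on both halves and then merges them through a caller-provided scratch buffer of `len / 2` elements. A minimal sketch of the same top-down scheme in safe Rust, assuming `std` and a hypothetical `mergesort_demo` name; it uses an owned copy of the left half instead of a `MaybeUninit` scratch slice.

// Hypothetical sketch, not the in-tree implementation: sort the halves, then merge, taking
// from the copied left half or the in-place right half, preferring the left side on ties so
// the merge stays stable.
fn mergesort_demo<T: Clone, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let len = v.len();
    if len > 2 {
        let mid = len / 2;
        mergesort_demo(&mut v[..mid], is_less);
        mergesort_demo(&mut v[mid..], is_less);

        let left: Vec<T> = v[..mid].to_vec();
        let (mut i, mut j, mut out) = (0, mid, 0);
        while i < left.len() && j < len {
            if is_less(&v[j], &left[i]) {
                v[out] = v[j].clone();
                j += 1;
            } else {
                v[out] = left[i].clone();
                i += 1;
            }
            out += 1;
        }
        while i < left.len() {
            v[out] = left[i].clone();
            i += 1;
            out += 1;
        }
        // Any leftover right-half elements are already in their final positions.
    } else if len == 2 && is_less(&v[1], &v[0]) {
        v.swap(0, 1);
    }
}

// Usage:
// let mut data = vec![5, 1, 4, 2, 3];
// mergesort_demo(&mut data, &mut |a, b| a < b);
// assert_eq!(data, [1, 2, 3, 4, 5]);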
diff --git a/library/core/src/slice/sort/unstable/heapsort.rs b/library/core/src/slice/sort/unstable/heapsort.rs
index 27e2ad588ea..85231779d03 100644
--- a/library/core/src/slice/sort/unstable/heapsort.rs
+++ b/library/core/src/slice/sort/unstable/heapsort.rs
@@ -1,46 +1,46 @@
 //! This module contains a branchless heapsort as fallback for unstable quicksort.

-use crate::{intrinsics, ptr};
+use crate::{cmp, intrinsics, ptr};

 /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
 ///
 /// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic
 /// fallback.
-///
-/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
 #[inline(never)]
-pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
+pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // SAFETY: See function safety.
-    unsafe {
-        intrinsics::assume(v.len() >= 2);
+    let len = v.len();

-        // Build the heap in linear time.
-        for i in (0..v.len() / 2).rev() {
-            sift_down(v, i, is_less);
-        }
-
-        // Pop maximal elements from the heap.
-        for i in (1..v.len()).rev() {
+    for i in (0..len + len / 2).rev() {
+        let sift_idx = if i >= len {
+            i - len
+        } else {
             v.swap(0, i);
-            sift_down(&mut v[..i], 0, is_less);
+            0
+        };
+
+        // SAFETY: The above calculation ensures that `sift_idx` is either 0 or
+        // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
+        // This guarantees the required `sift_idx <= len`.
+        unsafe {
+            sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
         }
     }
 }

 // This binary heap respects the invariant `parent >= child`.
 //
-// SAFETY: The caller has to guarantee that node < `v.len()`.
-#[inline(never)]
+// SAFETY: The caller has to guarantee that `node <= v.len()`.
+#[inline(always)]
 unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
     // SAFETY: See function safety.
     unsafe {
-        intrinsics::assume(node < v.len());
+        intrinsics::assume(node <= v.len());
     }

     let len = v.len();
@@ -69,9 +69,7 @@ where
                 break;
             }

-            // Swap `node` with the greater child, move one step down, and continue sifting. This
-            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
-            ptr::swap(v_base.add(node), v_base.add(child));
+            ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
         }

         node = child;
diff --git a/library/core/src/slice/sort/unstable/mod.rs b/library/core/src/slice/sort/unstable/mod.rs
index 932e01f4401..8bbd85443d4 100644
--- a/library/core/src/slice/sort/unstable/mod.rs
+++ b/library/core/src/slice/sort/unstable/mod.rs
@@ -2,7 +2,9 @@

 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

 pub(crate) mod heapsort;
@@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
         return;
     }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            heapsort::heapsort(v, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }

-    ipnsort(v, is_less);
+            ipnsort(v, is_less);
+        }
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
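The rewritten `heapsort` folds heap construction and the pop phase into a single loop: iterations with `i >= len` sift down interior nodes of the full slice, the remaining iterations swap the maximum to the back and re-sift the shrinking prefix. A small sketch (hypothetical `heapsort_schedule` helper, assuming `std`) that only reproduces the index arithmetic, to make the schedule visible.

// Hypothetical helper: entries are (i, sift_idx, prefix_len), i.e. the node that gets sifted
// down and the length of the slice prefix it is sifted within. The actual `v.swap(0, i)` of
// the pop phase is omitted here.
fn heapsort_schedule(len: usize) -> Vec<(usize, usize, usize)> {
    let mut steps = Vec::new();
    for i in (0..len + len / 2).rev() {
        // Same branch as in the patched `heapsort` loop.
        let sift_idx = if i >= len { i - len } else { 0 };
        steps.push((i, sift_idx, i.min(len)));
    }
    steps
}

// For len = 5 this yields:
//   (6, 1, 5), (5, 0, 5)                                   -- build phase: sift interior nodes 1, 0
//   (4, 0, 4), (3, 0, 3), (2, 0, 2), (1, 0, 1), (0, 0, 0)  -- pop phase: swap(0, i), sift v[..i]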
diff --git a/library/core/src/slice/sort/unstable/quicksort.rs b/library/core/src/slice/sort/unstable/quicksort.rs
index cd53656e9b4..4feef5deeb0 100644
--- a/library/core/src/slice/sort/unstable/quicksort.rs
+++ b/library/core/src/slice/sort/unstable/quicksort.rs
@@ -1,8 +1,12 @@
 //! This module contains an unstable quicksort and two partition implementations.

 use crate::mem::{self, ManuallyDrop};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;
+#[cfg(not(feature = "optimize_for_size"))]
+use crate::slice::sort::unstable::heapsort;
 use crate::{intrinsics, ptr};

 /// Sorts `v` recursively.
@@ -11,6 +15,7 @@ use crate::{intrinsics, ptr};
 ///
 /// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
 /// this function will immediately switch to heapsort.
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) fn quicksort<'a, T, F>(
     mut v: &'a mut [T],
     mut ancestor_pivot: Option<&'a T>,
@@ -28,10 +33,7 @@ pub(crate) fn quicksort<'a, T, F>(
         // If too many bad pivot choices were made, simply fall back to heapsort in order to
         // guarantee `O(N x log(N))` worst-case.
         if limit == 0 {
-            // SAFETY: We assume the `small_sort` threshold is at least 1.
-            unsafe {
-                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
-            }
+            heapsort::heapsort(v, is_less);
             return;
         }

@@ -98,13 +100,15 @@ where
         return 0;
     }

-    // Allows for panic-free code-gen by proving this property to the compiler.
     if pivot >= len {
         intrinsics::abort();
     }

-    // Place the pivot at the beginning of slice.
-    v.swap(0, pivot);
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
     let (pivot, v_without_pivot) = v.split_at_mut(1);

     // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
@@ -118,8 +122,15 @@ where
     // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
     let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

-    // Place the pivot between the two partitions.
-    v.swap(0, num_lt);
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }

     num_lt
 }
@@ -129,7 +140,13 @@ const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut
     if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
         // Specialize for types that are relatively cheap to copy, where branchless optimizations
         // have large leverage e.g. `u64` and `String`.
-        partition_lomuto_branchless_cyclic::<T, F>
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                partition_lomuto_branchless_simple::<T, F>
+            } else {
+                partition_lomuto_branchless_cyclic::<T, F>
+            }
+        }
     } else {
         partition_hoare_branchy_cyclic::<T, F>
     }
@@ -215,6 +232,7 @@ where
     }
 }

+#[cfg(not(feature = "optimize_for_size"))]
 struct PartitionState<T> {
     // The current element that is being looked at, scans left to right through slice.
     right: *mut T,
@@ -225,6 +243,7 @@ struct PartitionState<T> {
     gap: GapGuardRaw<T>,
 }

+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
 where
     F: FnMut(&T, &T) -> bool,
 {
@@ -316,6 +335,27 @@ where
     }
 }

+#[cfg(feature = "optimize_for_size")]
+fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    pivot: &T,
+    is_less: &mut F,
+) -> usize {
+    let mut left = 0;
+
+    for right in 0..v.len() {
+        // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that
+        // left <= right and that both are in-bounds.
+        unsafe {
+            let right_is_lt = is_less(v.get_unchecked(right), pivot);
+            v.swap_unchecked(left, right);
+            left += right_is_lt as usize;
+        }
+    }
+
+    left
+}
+
 struct GapGuard<T> {
     pos: *mut T,
     value: ManuallyDrop<T>,
@@ -333,11 +373,13 @@ impl<T> Drop for GapGuard<T> {

 /// Ideally this wouldn't be needed and we could just use the regular GapGuard.
 /// See comment in [`partition_lomuto_branchless_cyclic`].
+#[cfg(not(feature = "optimize_for_size"))]
 struct GapGuardRaw<T> {
     pos: *mut T,
     value: *mut T,
 }

+#[cfg(not(feature = "optimize_for_size"))]
 impl<T> Drop for GapGuardRaw<T> {
     fn drop(&mut self) {
         // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into
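The `partition_lomuto_branchless_simple` variant introduced for the size-optimized path keeps the classic Lomuto invariant: everything in `v[..left]` compared less than the pivot, everything in `v[left..right]` did not, and every element is swapped to `left` unconditionally so the loop body needs no data-dependent branch for the swap. A hedged safe-Rust sketch of the same scheme (hypothetical `partition_demo` name, bounds-checked indexing instead of `get_unchecked`/`swap_unchecked`), plus a small usage example:

fn partition_demo<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize {
    let mut left = 0;
    for right in 0..v.len() {
        // Record the comparison before the swap moves the element at `right`.
        let right_is_lt = is_less(&v[right], pivot);
        // Unconditional swap; when `right_is_lt` is false the element stays in the
        // "greater or equal" region because `left` does not advance.
        v.swap(left, right);
        left += right_is_lt as usize;
    }
    left
}

// Usage:
// let mut data = [7, 1, 4, 9, 2, 5, 3];
// let num_lt = partition_demo(&mut data, &4, &mut |a, b| a < b);
// assert_eq!(num_lt, 3); // data[..3] now holds 1, 2, 3 in some order; the rest are >= 4.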