Replace sort implementations

- `slice::sort` -> driftsort
  https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md

- `slice::sort_unstable` -> ipnsort
  https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md

Replaces the sort implementations with tailor-made ones that strike a
balance between run-time, compile-time and binary-size, yielding
run-time and compile-time improvements. Binary-size regresses for
`slice::sort` but improves for `slice::sort_unstable`. All existing
soft and hard safety guarantees are upheld, and the soft guarantees
are even extended: strict weak ordering violations are now detected
with a high chance and reported to users via a panic.

In addition, the implementation of `select_nth_unstable` is also
adapted, as it uses `slice::sort_unstable` internals.
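
To make the extended soft guarantee concrete, here is a small, hedged
example (illustrative, not part of the diff): a comparator that violates
strict weak ordering may now be detected and reported via a panic, and in
either case the slice remains a permutation of the original input.

```
use std::cmp::Ordering;
use std::panic;

fn main() {
    let mut v: Vec<i32> = (0..500).collect();

    // Deliberately inconsistent comparator: it never returns Equal and is
    // not transitive, so it violates the strict weak ordering requirement.
    let result = panic::catch_unwind(panic::AssertUnwindSafe(|| {
        v.sort_by(|a, b| if (a + b) % 3 == 0 { Ordering::Less } else { Ordering::Greater });
    }));

    if result.is_err() {
        println!("ordering violation detected and reported via a panic");
    }

    // Whether or not the violation was detected, all original elements
    // are still present (the soft guarantee).
    v.sort();
    assert_eq!(v, (0..500).collect::<Vec<i32>>());
}
```
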
Lukas Bergdoll 2024-04-16 20:24:21 +02:00
parent 4a78c00e22
commit e49be415cd
18 changed files with 2621 additions and 1688 deletions


@ -16,7 +16,7 @@ use core::borrow::{Borrow, BorrowMut};
#[cfg(not(no_global_oom_handling))]
use core::cmp::Ordering::{self, Less};
#[cfg(not(no_global_oom_handling))]
use core::mem::{self, SizedTypeProperties};
use core::mem::{self, MaybeUninit};
#[cfg(not(no_global_oom_handling))]
use core::ptr;
#[cfg(not(no_global_oom_handling))]
@ -24,7 +24,7 @@ use core::slice::sort;
use crate::alloc::Allocator;
#[cfg(not(no_global_oom_handling))]
use crate::alloc::{self, Global};
use crate::alloc::Global;
#[cfg(not(no_global_oom_handling))]
use crate::borrow::ToOwned;
use crate::boxed::Box;
@ -174,23 +174,32 @@ pub(crate) mod hack {
#[cfg(not(test))]
impl<T> [T] {
/// Sorts the slice.
/// Sorts the slice, preserving initial order of equal elements.
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
/// worst-case.
///
/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
/// original elements will remain in the slice and any possible modifications via interior
/// mutability are observed in the input. Same is true if `T: Ord` panics.
///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
/// sorting and it doesn't allocate auxiliary memory.
/// See [`sort_unstable`](slice::sort_unstable).
/// sorting and it doesn't allocate auxiliary memory. See
/// [`sort_unstable`](slice::sort_unstable). The exception are partially sorted slices, which
/// may be better served with `slice::sort`.
///
/// # Current implementation
///
/// The current algorithm is an adaptive, iterative merge sort inspired by
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
/// two or more sorted sequences concatenated one after another.
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
/// combines the fast average case of quicksort with the fast worst case and partial run
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
///
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
/// non-allocating insertion sort is used instead.
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
/// clamps at `self.len() / 2`.
///
/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -200,6 +209,8 @@ impl<T> [T] {
/// v.sort();
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
///
/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@ -211,13 +222,18 @@ impl<T> [T] {
stable_sort(self, T::lt);
}
/// Sorts the slice with a comparator function.
/// Sorts the slice with a comparator function, preserving initial order of equal elements.
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*))
/// worst-case.
///
/// The comparator function must define a total ordering for the elements in the slice. If
/// the ordering is not total, the order of the elements is unspecified. An order is a
/// total order if it is (for all `a`, `b` and `c`):
/// The comparator function should define a total ordering for the elements in the slice. If the
/// ordering is not total, the order of the elements is unspecified.
///
/// If the comparator function does not implement a total order the resulting order is
/// unspecified. All original elements will remain in the slice and any possible modifications
/// via interior mutability are observed in the input. Same is true if the comparator function
/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
@ -227,23 +243,22 @@ impl<T> [T] {
///
/// ```
/// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
/// floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
/// floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
/// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
/// ```
///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
/// sorting and it doesn't allocate auxiliary memory.
/// See [`sort_unstable_by`](slice::sort_unstable_by).
///
/// # Current implementation
///
/// The current algorithm is an adaptive, iterative merge sort inspired by
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
/// two or more sorted sequences concatenated one after another.
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
/// combines the fast average case of quicksort with the fast worst case and partial run
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
///
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
/// non-allocating insertion sort is used instead.
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
/// clamps at `self.len() / 2`.
///
/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -256,6 +271,8 @@ impl<T> [T] {
/// v.sort_by(|a, b| b.cmp(a));
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
///
/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "rust1", since = "1.0.0")]
@ -267,28 +284,27 @@ impl<T> [T] {
stable_sort(self, |a, b| compare(a, b) == Less);
}
/// Sorts the slice with a key extraction function.
/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
/// worst-case, where the key function is *O*(*m*).
///
/// For expensive key functions (e.g. functions that are not simple property accesses or
/// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be
/// significantly faster, as it does not recompute element keys.
///
/// When applicable, unstable sorting is preferred because it is generally faster than stable
/// sorting and it doesn't allocate auxiliary memory.
/// See [`sort_unstable_by_key`](slice::sort_unstable_by_key).
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
/// All original elements will remain in the slice and any possible modifications via interior
/// mutability are observed in the input. Same is true if `K: Ord` panics.
///
/// # Current implementation
///
/// The current algorithm is an adaptive, iterative merge sort inspired by
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
/// two or more sorted sequences concatenated one after another.
/// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which
/// combines the fast average case of quicksort with the fast worst case and partial run
/// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs
/// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*.
///
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
/// non-allocating insertion sort is used instead.
/// The auxiliary memory allocation behavior depends on the input length. Short slices are
/// handled without allocation, medium sized slices allocate `self.len()` and beyond that it
/// clamps at `self.len() / 2`.
///
/// If `K: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -298,6 +314,8 @@ impl<T> [T] {
/// v.sort_by_key(|k| k.abs());
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
///
/// [driftsort]: https://github.com/Voultapher/driftsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_key", since = "1.7.0")]
@ -310,27 +328,30 @@ impl<T> [T] {
stable_sort(self, |a, b| f(a).lt(&f(b)));
}
/// Sorts the slice with a key extraction function.
/// Sorts the slice with a key extraction function, preserving initial order of equal elements.
///
/// During sorting, the key function is called at most once per element, by using
/// temporary storage to remember the results of key evaluation.
/// The order of calls to the key function is unspecified and may change in future versions
/// of the standard library.
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \*
/// log(*n*)) worst-case, where the key function is *O*(*m*).
///
/// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
/// worst-case, where the key function is *O*(*m*).
/// During sorting, the key function is called at most once per element, by using temporary
/// storage to remember the results of key evaluation. The order of calls to the key function is
/// unspecified and may change in future versions of the standard library.
///
/// For simple key functions (e.g., functions that are property accesses or
/// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be
/// faster.
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
/// All original elements will remain in the slice and any possible modifications via interior
/// mutability are observed in the input. Same is true if `K: Ord` panics.
///
/// For simple key functions (e.g., functions that are property accesses or basic operations),
/// [`sort_by_key`](slice::sort_by_key) is likely to be faster.
///
/// # Current implementation
///
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
/// which combines the fast average case of randomized quicksort with the fast worst case of
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
/// deterministic behavior.
/// The current implementation is based on [instruction-parallel-network sort][ipnsort] by Lukas
/// Bergdoll, which combines the fast average case of randomized quicksort with the fast worst
/// case of heapsort, while achieving linear time on fully sorted and reversed inputs. And
/// *O*(*k* \* log(*n*)) where *k* is the number of distinct elements in the input. It leverages
/// superscalar out-of-order execution capabilities commonly found in CPUs, to efficiently
/// perform the operation.
///
/// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
/// length of the slice.
@ -344,7 +365,7 @@ impl<T> [T] {
/// assert!(v == [-3, -5, 2, 32, 4]);
/// ```
///
/// [pdqsort]: https://github.com/orlp/pdqsort
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")]
@ -361,7 +382,7 @@ impl<T> [T] {
$slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
// The elements of `indices` are unique, as they are indexed, so any sort will be
// stable with respect to the original slice. We use `sort_unstable` here because
// it requires less memory allocation.
// it requires no memory allocation.
indices.sort_unstable();
for i in 0..$slice.len() {
let mut index = indices[i].1;
@ -374,24 +395,24 @@ impl<T> [T] {
}};
}
let sz_u8 = mem::size_of::<(K, u8)>();
let sz_u16 = mem::size_of::<(K, u16)>();
let sz_u32 = mem::size_of::<(K, u32)>();
let sz_usize = mem::size_of::<(K, usize)>();
let len = self.len();
if len < 2 {
return;
}
if sz_u8 < sz_u16 && len <= (u8::MAX as usize) {
return sort_by_key!(u8, self, f);
}
if sz_u16 < sz_u32 && len <= (u16::MAX as usize) {
return sort_by_key!(u16, self, f);
}
if sz_u32 < sz_usize && len <= (u32::MAX as usize) {
// Avoids binary-size usage in cases where the alignment doesn't work out to make this
// beneficial or on 32-bit platforms.
let is_using_u32_as_idx_type_helpful =
const { mem::size_of::<(K, u32)>() < mem::size_of::<(K, usize)>() };
// It's possible to instantiate this for u8 and u16 but, doing so is very wasteful in terms
// of compile-times and binary-size, the peak saved heap memory for u16 is (u8 + u16) -> 4
// bytes * u16::MAX vs (u8 + u32) -> 8 bytes * u16::MAX, the saved heap memory is at peak
// ~262KB.
if is_using_u32_as_idx_type_helpful && len <= (u32::MAX as usize) {
return sort_by_key!(u32, self, f);
}
sort_by_key!(usize, self, f)
}
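
For illustration (not part of the diff), a quick check of the
`is_using_u32_as_idx_type_helpful` condition on a typical 64-bit target:
for a `u64` key, `(K, u32)` is padded to the same 16 bytes as `(K, usize)`,
so the `u32` index type saves nothing, while for a `u32` key it halves the
per-element footprint of the temporary buffer.

```
use std::mem::size_of;

fn main() {
    // Typical sizes on a 64-bit target; tuple layout is not strictly guaranteed.
    println!("(u64, u32):   {} bytes", size_of::<(u64, u32)>());   // usually 16 (padding)
    println!("(u64, usize): {} bytes", size_of::<(u64, usize)>()); // usually 16 -> u32 not helpful
    println!("(u32, u32):   {} bytes", size_of::<(u32, u32)>());   // usually 8
    println!("(u32, usize): {} bytes", size_of::<(u32, usize)>()); // usually 16 -> u32 helpful
}
```
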
@ -843,46 +864,18 @@ fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
where
F: FnMut(&T, &T) -> bool,
{
if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
return;
use sort::stable::BufGuard;
#[unstable(issue = "none", feature = "std_internals")]
impl<T> BufGuard<T> for Vec<T> {
fn with_capacity(capacity: usize) -> Self {
Vec::with_capacity(capacity)
}
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
self.spare_capacity_mut()
}
}
let elem_alloc_fn = |len: usize| -> *mut T {
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
// v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
// elements.
unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
};
let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
// v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
// len.
unsafe {
alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
}
};
let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
// SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
// obscene length or 0.
unsafe {
alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
as *mut sort::TimSortRun
}
};
let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
// SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
// len.
unsafe {
alloc::dealloc(
buf_ptr as *mut u8,
alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
);
}
};
sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
sort::stable::sort::<T, F, Vec<T>>(v, &mut is_less);
}
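
Judging from the `Vec<T>` impl above, the `sort::stable::BufGuard` trait
presumably looks roughly like the sketch below (hypothetical; the actual
definition lives in `core::slice::sort::stable` and may differ). It lets
`core` ask for a scratch buffer of a given capacity while the allocation
itself is provided by `alloc` through `Vec`.

```
use core::mem::MaybeUninit;

// Hypothetical sketch inferred from the impl above, not the diff's code.
pub trait BufGuard<T> {
    /// Allocate a buffer able to hold at least `capacity` elements.
    fn with_capacity(capacity: usize) -> Self;
    /// Hand out the buffer's (possibly uninitialized) spare storage.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
```
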


@ -34,7 +34,7 @@ macro_rules! do_test {
}
let v = $input.to_owned();
let _ = std::panic::catch_unwind(move || {
let _ = panic::catch_unwind(move || {
let mut v = v;
let mut panic_countdown = panic_countdown;
v.$func(|a, b| {
@ -197,8 +197,7 @@ fn panic_safe() {
let mut rng = test_rng();
// Miri is too slow (but still need to `chain` to make the types match)
let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) };
let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) };
let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] };
for len in lens {
@ -294,15 +293,20 @@ fn test_sort() {
}
}
// Sort using a completely random comparison function.
// This will reorder the elements *somehow*, but won't panic.
let mut v = [0; 500];
for i in 0..v.len() {
const ORD_VIOLATION_MAX_LEN: usize = 500;
let mut v = [0; ORD_VIOLATION_MAX_LEN];
for i in 0..ORD_VIOLATION_MAX_LEN {
v[i] = i as i32;
}
v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
// Sort using a completely random comparison function. This will reorder the elements *somehow*,
// it may panic but the original elements must still be present.
let _ = panic::catch_unwind(move || {
v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
});
v.sort();
for i in 0..v.len() {
for i in 0..ORD_VIOLATION_MAX_LEN {
assert_eq!(v[i], i as i32);
}


@ -39,7 +39,6 @@ pub(crate) mod index;
mod iter;
mod raw;
mod rotate;
mod select;
mod specialize;
#[unstable(feature = "str_internals", issue = "none")]
@ -83,10 +82,6 @@ pub use raw::{from_mut, from_ref};
#[unstable(feature = "slice_from_ptr_range", issue = "89792")]
pub use raw::{from_mut_ptr_range, from_ptr_range};
// This function is public only because there is no other way to unit test heapsort.
#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")]
pub use sort::heapsort;
#[stable(feature = "slice_get_slice", since = "1.28.0")]
pub use index::SliceIndex;
@ -2884,21 +2879,26 @@ impl<T> [T] {
self.binary_search_by(|k| f(k).cmp(b))
}
/// Sorts the slice, but might not preserve the order of equal elements.
/// Sorts the slice **without** preserving the initial order of equal elements.
///
/// This sort is unstable (i.e., may reorder equal elements), in-place
/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
/// If `T: Ord` does not implement a total order the resulting order is unspecified. All
/// original elements will remain in the slice and any possible modifications via interior
/// mutability are observed in the input. Same is true if `T: Ord` panics.
///
/// # Current implementation
///
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
/// which combines the fast average case of randomized quicksort with the fast worst case of
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
/// deterministic behavior.
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
/// expected time to sort the data is *O(*n* log(*k*))*.
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
/// slice consists of several concatenated sorted sequences.
/// slice is partially sorted.
///
/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -2909,25 +2909,29 @@ impl<T> [T] {
/// assert!(v == [-5, -3, 1, 2, 4]);
/// ```
///
/// [pdqsort]: https://github.com/orlp/pdqsort
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable(&mut self)
where
T: Ord,
{
sort::quicksort(self, T::lt);
sort::unstable::sort(self, &mut T::lt);
}
/// Sorts the slice with a comparator function, but might not preserve the order of equal
/// elements.
/// Sorts the slice with a comparator function, **without** preserving the initial order of
/// equal elements.
///
/// This sort is unstable (i.e., may reorder equal elements), in-place
/// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
/// The comparator function must define a total ordering for the elements in the slice. If
/// the ordering is not total, the order of the elements is unspecified. An order is a
/// total order if it is (for all `a`, `b` and `c`):
/// The comparator function should define a total ordering for the elements in the slice. If the
/// ordering is not total, the order of the elements is unspecified.
///
/// If the comparator function does not implement a total order the resulting order is
/// unspecified. All original elements will remain in the slice and any possible modifications
/// via interior mutability are observed in the input. Same is true if the comparator function
/// panics. A total order (for all `a`, `b` and `c`):
///
/// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
/// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
@ -2943,14 +2947,15 @@ impl<T> [T] {
///
/// # Current implementation
///
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
/// which combines the fast average case of randomized quicksort with the fast worst case of
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
/// deterministic behavior.
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
/// expected time to sort the data is *O(*n* log(*k*))*.
///
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
/// slice consists of several concatenated sorted sequences.
/// slice is partially sorted.
///
/// If `T: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -2964,34 +2969,37 @@ impl<T> [T] {
/// assert!(v == [5, 4, 3, 2, 1]);
/// ```
///
/// [pdqsort]: https://github.com/orlp/pdqsort
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by<F>(&mut self, mut compare: F)
where
F: FnMut(&T, &T) -> Ordering,
{
sort::quicksort(self, |a, b| compare(a, b) == Ordering::Less);
sort::unstable::sort(self, &mut |a, b| compare(a, b) == Ordering::Less);
}
/// Sorts the slice with a key extraction function, but might not preserve the order of equal
/// elements.
/// Sorts the slice with a key extraction function, **without** preserving the initial order of
/// equal elements.
///
/// This sort is unstable (i.e., may reorder equal elements), in-place
/// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case, where the key function is
/// *O*(*m*).
/// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not
/// allocate), and *O*(*n* \* log(*n*)) worst-case.
///
/// If `K: Ord` does not implement a total order the resulting order is unspecified.
/// All original elements will remain in the slice and any possible modifications via interior
/// mutability are observed in the input. Same is true if `K: Ord` panics.
///
/// # Current implementation
///
/// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
/// which combines the fast average case of randomized quicksort with the fast worst case of
/// heapsort, while achieving linear time on slices with certain patterns. It uses some
/// randomization to avoid degenerate cases, but with a fixed seed to always provide
/// deterministic behavior.
/// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which
/// combines the fast average case of quicksort with the fast worst case of heapsort, achieving
/// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the
/// expected time to sort the data is *O(*n* log(*k*))*.
///
/// Due to its key calling strategy, [`sort_unstable_by_key`](#method.sort_unstable_by_key)
/// is likely to be slower than [`sort_by_cached_key`](#method.sort_by_cached_key) in
/// cases where the key function is expensive.
/// It is typically faster than stable sorting, except in a few special cases, e.g., when the
/// slice is partially sorted.
///
/// If `K: Ord` does not implement a total order, the implementation may panic.
///
/// # Examples
///
@ -3002,7 +3010,7 @@ impl<T> [T] {
/// assert!(v == [1, 2, -3, 4, -5]);
/// ```
///
/// [pdqsort]: https://github.com/orlp/pdqsort
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "sort_unstable", since = "1.20.0")]
#[inline]
pub fn sort_unstable_by_key<K, F>(&mut self, mut f: F)
@ -3010,27 +3018,32 @@ impl<T> [T] {
F: FnMut(&T) -> K,
K: Ord,
{
sort::quicksort(self, |a, b| f(a).lt(&f(b)));
sort::unstable::sort(self, &mut |a, b| f(a).lt(&f(b)));
}
/// Reorder the slice such that the element at `index` after the reordering is at its final sorted position.
/// Reorder the slice such that the element at `index` after the reordering is at its final
/// sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index`. Additionally, this reordering is
/// unstable (i.e. any number of equal elements may end up at position `index`), in-place
/// (i.e. does not allocate), and runs in *O*(*n*) time.
/// This function is also known as "kth element" in other libraries.
/// unstable (i.e. any number of equal elements may end up at position `index`), in-place (i.e.
/// does not allocate), and runs in *O*(*n*) time. This function is also known as "kth element"
/// in other libraries.
///
/// It returns a triplet of the following from the reordered slice:
/// the subslice prior to `index`, the element at `index`, and the subslice after `index`;
/// accordingly, the values in those two subslices will respectively all be less-than-or-equal-to
/// and greater-than-or-equal-to the value of the element at `index`.
/// It returns a triplet of the following from the reordered slice: the subslice prior to
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
/// those two subslices will respectively all be less-than-or-equal-to and
/// greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
/// pivot selection, which guarantees linear runtime for all inputs.
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@ -3058,35 +3071,40 @@ impl<T> [T] {
/// v == [-3, -5, 1, 4, 2] ||
/// v == [-5, -3, 1, 4, 2]);
/// ```
///
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [T])
where
T: Ord,
{
select::partition_at_index(self, index, T::lt)
sort::select::partition_at_index(self, index, T::lt)
}
/// Reorder the slice with a comparator function such that the element at `index` after the reordering is at
/// its final sorted position.
/// Reorder the slice with a comparator function such that the element at `index` after the
/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the comparator function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
/// This function is also known as "kth element" in other libraries.
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
/// function is also known as "kth element" in other libraries.
///
/// It returns a triplet of the following from
/// the slice reordered according to the provided comparator function: the subslice prior to
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
/// the value of the element at `index`.
/// It returns a triplet of the following from the slice reordered according to the provided
/// comparator function: the subslice prior to `index`, the element at `index`, and the subslice
/// after `index`; accordingly, the values in those two subslices will respectively all be
/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
/// pivot selection, which guarantees linear runtime for all inputs.
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@ -3114,6 +3132,8 @@ impl<T> [T] {
/// v == [4, 2, 1, -5, -3] ||
/// v == [4, 2, 1, -3, -5]);
/// ```
///
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by<F>(
@ -3124,29 +3144,32 @@ impl<T> [T] {
where
F: FnMut(&T, &T) -> Ordering,
{
select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
sort::select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less)
}
/// Reorder the slice with a key extraction function such that the element at `index` after the reordering is
/// at its final sorted position.
/// Reorder the slice with a key extraction function such that the element at `index` after the
/// reordering is at its final sorted position.
///
/// This reordering has the additional property that any value at position `i < index` will be
/// less than or equal to any value at a position `j > index` using the key extraction function.
/// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time.
/// This function is also known as "kth element" in other libraries.
/// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This
/// function is also known as "kth element" in other libraries.
///
/// It returns a triplet of the following from
/// the slice reordered according to the provided key extraction function: the subslice prior to
/// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in
/// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
/// the value of the element at `index`.
/// It returns a triplet of the following from the slice reordered according to the provided key
/// extraction function: the subslice prior to `index`, the element at `index`, and the subslice
/// after `index`; accordingly, the values in those two subslices will respectively all be
/// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`.
///
/// # Current implementation
///
/// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
/// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for
/// pivot selection, which guarantees linear runtime for all inputs.
/// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll
/// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is
/// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime
/// for all inputs.
///
/// It is typically faster than sorting, except in a few special cases, e.g., when the slice is
/// nearly fully sorted, where [`slice::sort`] may be faster.
///
/// [`sort_unstable`]: slice::sort_unstable
///
@ -3174,6 +3197,8 @@ impl<T> [T] {
/// v == [2, 1, -3, 4, -5] ||
/// v == [2, 1, -3, -5, 4]);
/// ```
///
/// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort
#[stable(feature = "slice_select_nth_unstable", since = "1.49.0")]
#[inline]
pub fn select_nth_unstable_by_key<K, F>(
@ -3185,7 +3210,7 @@ impl<T> [T] {
F: FnMut(&T) -> K,
K: Ord,
{
select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
sort::select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b)))
}
/// Moves all consecutive repeated elements to the end of the slice according to the

File diff suppressed because it is too large.


@ -0,0 +1,8 @@
//! This module and the contained sub-modules contains the code for efficient and robust sort
//! implementations, as well as the domain adjacent implementation of `select_nth_unstable`.
pub mod stable;
pub mod unstable;
pub(crate) mod select;
pub(crate) mod shared;


@ -1,45 +1,78 @@
//! Slice selection
//!
//! This module contains the implementation for `slice::select_nth_unstable`.
//! It uses an introselect algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//! It uses an introselect algorithm based on ipnsort by Lukas Bergdoll and Orson Peters,
//! published at: <https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort>
//!
//! The fallback algorithm used for introselect is Median of Medians using Tukey's Ninther
//! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
//! better performance than one would get using heapsort as fallback.
use crate::cmp;
use crate::mem::{self, SizedTypeProperties};
use crate::slice::sort::{
break_patterns, choose_pivot, insertion_sort_shift_left, partition, partition_equal,
};
// For slices of up to this length it's probably faster to simply sort them.
// Defined at the module scope because it's used in multiple functions.
const MAX_INSERTION: usize = 10;
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;
/// Reorder the slice such that the element at `index` is at its final sorted position.
pub(crate) fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
// Puts a lower limit of 1 on `len`.
if index >= len {
panic!("partition_at_index index {} greater than length of slice {}", index, len);
}
if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == len - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we checked that `v` is not empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}
let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}
// For small sub-slices it's faster to use a dedicated small-sort, but because it is only called at
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;
fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
mut ancestor_pivot: Option<&'a T>,
is_less: &mut F,
mut pred: Option<&'a T>,
) where
F: FnMut(&T, &T) -> bool,
{
// Limit the amount of iterations and fall back to fast deterministic selection
// to ensure O(n) worst case running time. This limit needs to be constant, because
// using `ilog2(len)` like in `sort` would result in O(n log n) time complexity.
// The exact value of the limit is chosen somewhat arbitrarily, but for most inputs bad pivot
// selections should be relatively rare, so the limit usually shouldn't be reached
// anyways.
// Limit the amount of iterations and fall back to fast deterministic selection to ensure O(n)
// worst case running time. This limit needs to be constant, because using `ilog2(len)` like in
// `sort` would result in O(n log n) time complexity. The exact value of the limit is chosen
// somewhat arbitrarily, but for most inputs bad pivot selections should be relatively rare, so
// the limit is reached for sub-slices len / (2^limit or less). Which makes the remaining work
// with the fallback minimal in relative terms.
let mut limit = 16;
// True if the last partitioning was reasonably balanced.
let mut was_balanced = true;
loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@ -50,38 +83,35 @@ fn partition_at_index_loop<'a, T, F>(
return;
}
// If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
// some elements around. Hopefully we'll choose a better pivot this time.
if !was_balanced {
break_patterns(v);
limit -= 1;
}
limit -= 1;
// Choose a pivot
let (pivot, _) = choose_pivot(v, is_less);
let pivot_pos = choose_pivot(v, is_less);
// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = pred {
if !is_less(p, &v[pivot]) {
let mid = partition_equal(v, pivot, is_less);
if let Some(p) = ancestor_pivot {
if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));
// Continue sorting elements greater than the pivot. We know that `mid` contains
// the pivot. So we can continue after `mid`.
let mid = num_lt + 1;
// If we've passed our index, then we're good.
if mid > index {
return;
}
// Otherwise, continue sorting elements greater than the pivot.
v = &mut v[mid..];
index = index - mid;
pred = None;
ancestor_pivot = None;
continue;
}
}
let (mid, _) = partition(v, pivot, is_less);
was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8;
let mid = partition(v, pivot_pos, is_less);
// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
@ -91,7 +121,7 @@ fn partition_at_index_loop<'a, T, F>(
if mid < index {
v = right;
index = index - mid - 1;
pred = Some(pivot);
ancestor_pivot = Some(pivot);
} else if mid > index {
v = left;
} else {
@ -122,41 +152,6 @@ fn max_index<T, F: FnMut(&T, &T) -> bool>(slice: &[T], is_less: &mut F) -> Optio
.map(|(i, _)| i)
}
/// Reorder the slice such that the element at `index` is at its final sorted position.
pub fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
mut is_less: F,
) -> (&mut [T], &mut T, &mut [T])
where
F: FnMut(&T, &T) -> bool,
{
if index >= v.len() {
panic!("partition_at_index index {} greater than length of slice {}", index, v.len());
}
if T::IS_ZST {
// Sorting has no meaningful behavior on zero-sized types. Do nothing.
} else if index == v.len() - 1 {
// Find max element and place it in the last position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let max_idx = max_index(v, &mut is_less).unwrap();
v.swap(max_idx, index);
} else if index == 0 {
// Find min element and place it in the first position of the array. We're free to use
// `unwrap()` here because we know v must not be empty.
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, &mut is_less, None);
}
let (left, right) = v.split_at_mut(index);
let (pivot, right) = right.split_at_mut(1);
let pivot = &mut pivot[0];
(left, pivot, right)
}
/// Selection algorithm to select the k-th element from the slice in guaranteed O(n) time.
/// This is essentially a quickselect that uses Tukey's Ninther for pivot selection
fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) {
@ -168,8 +163,8 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
// We now know that `k < v.len() <= isize::MAX`
loop {
if v.len() <= MAX_INSERTION {
if v.len() > 1 {
if v.len() <= INSERTION_SORT_THRESHOLD {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
return;
@ -232,7 +227,8 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
}
median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
partition(v, lo + pivot, is_less).0
partition(v, lo + pivot, is_less)
}
/// Moves around the 9 elements at the indices a..i, such that


@ -0,0 +1,45 @@
use crate::marker::Freeze;
pub(crate) mod pivot;
pub(crate) mod smallsort;
/// SAFETY: this is safety relevant, how does this interact with the soundness holes in
/// specialization?
#[rustc_unsafe_specialization_marker]
pub(crate) trait FreezeMarker {}
impl<T: Freeze> FreezeMarker for T {}
/// Finds a run of sorted elements starting at the beginning of the slice.
///
/// Returns the length of the run, and a bool that is false when the run
/// is ascending, and true if the run strictly descending.
#[inline(always)]
pub(crate) fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(
v: &[T],
is_less: &mut F,
) -> (usize, bool) {
let len = v.len();
if len < 2 {
return (len, false);
}
// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
// This also means that run_len < len implies run_len and run_len - 1
// are valid indices as well.
unsafe {
let mut run_len = 2;
let strictly_descending = is_less(v.get_unchecked(1), v.get_unchecked(0));
if strictly_descending {
while run_len < len && is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) {
run_len += 1;
}
} else {
while run_len < len && !is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1))
{
run_len += 1;
}
}
(run_len, strictly_descending)
}
}
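
To illustrate the contract of `find_existing_run` with concrete inputs,
here is a safe, self-contained restatement of the same logic (illustrative
only; the real function uses unchecked indexing):

```
// Returns (run_len, strictly_descending) for the run at the start of `v`.
fn find_existing_run_demo<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> (usize, bool) {
    if v.len() < 2 {
        return (v.len(), false);
    }
    let strictly_descending = is_less(&v[1], &v[0]);
    let mut run_len = 2;
    while run_len < v.len() {
        let continues = if strictly_descending {
            is_less(&v[run_len], &v[run_len - 1])
        } else {
            !is_less(&v[run_len], &v[run_len - 1])
        };
        if !continues {
            break;
        }
        run_len += 1;
    }
    (run_len, strictly_descending)
}

fn main() {
    let mut lt = |a: &i32, b: &i32| a < b;
    // A weakly ascending run of length 4, then a smaller element.
    assert_eq!(find_existing_run_demo(&[1, 2, 2, 3, 0][..], &mut lt), (4, false));
    // A strictly descending run of length 3, then a larger element.
    assert_eq!(find_existing_run_demo(&[5, 3, 1, 4][..], &mut lt), (3, true));
}
```
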


@ -0,0 +1,88 @@
//! This module contains the logic for pivot selection.
use crate::intrinsics;
// Recursively select a pseudomedian if above this threshold.
const PSEUDO_MEDIAN_REC_THRESHOLD: usize = 64;
/// Selects a pivot from `v`. Algorithm taken from glidesort by Orson Peters.
///
/// This chooses a pivot by sampling an adaptive amount of points, approximating
/// the quality of a median of sqrt(n) elements.
pub fn choose_pivot<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> usize {
// We use unsafe code and raw pointers here because we're dealing with
// heavy recursion. Passing safe slices around would involve a lot of
// branches and function call overhead.
let len = v.len();
if len < 8 {
intrinsics::abort();
}
// SAFETY: a, b, c point to initialized regions of len_div_8 elements,
// satisfying median3 and median3_rec's preconditions as v_base points
// to an initialized region of n = len elements.
unsafe {
let v_base = v.as_ptr();
let len_div_8 = len / 8;
let a = v_base; // [0, floor(n/8))
let b = v_base.add(len_div_8 * 4); // [4*floor(n/8), 5*floor(n/8))
let c = v_base.add(len_div_8 * 7); // [7*floor(n/8), 8*floor(n/8))
if len < PSEUDO_MEDIAN_REC_THRESHOLD {
median3(&*a, &*b, &*c, is_less).sub_ptr(v_base)
} else {
median3_rec(a, b, c, len_div_8, is_less).sub_ptr(v_base)
}
}
}
/// Calculates an approximate median of 3 elements from sections a, b, c, or
/// recursively from an approximation of each, if they're large enough. By
/// dividing the size of each section by 8 when recursing we have logarithmic
/// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
/// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
///
/// SAFETY: a, b, c must point to the start of initialized regions of memory of
/// at least n elements.
unsafe fn median3_rec<T, F: FnMut(&T, &T) -> bool>(
mut a: *const T,
mut b: *const T,
mut c: *const T,
n: usize,
is_less: &mut F,
) -> *const T {
// SAFETY: a, b, c still point to initialized regions of n / 8 elements,
// by the exact same logic as in choose_pivot.
unsafe {
if n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD {
let n8 = n / 8;
a = median3_rec(a, a.add(n8 * 4), a.add(n8 * 7), n8, is_less);
b = median3_rec(b, b.add(n8 * 4), b.add(n8 * 7), n8, is_less);
c = median3_rec(c, c.add(n8 * 4), c.add(n8 * 7), n8, is_less);
}
median3(&*a, &*b, &*c, is_less)
}
}
/// Calculates the median of 3 elements.
///
/// SAFETY: a, b, c must be valid initialized elements.
#[inline(always)]
fn median3<T, F: FnMut(&T, &T) -> bool>(a: &T, b: &T, c: &T, is_less: &mut F) -> *const T {
// Compiler tends to make this branchless when sensible, and avoids the
// third comparison when not.
let x = is_less(a, b);
let y = is_less(a, c);
if x == y {
// If x=y=0 then b, c <= a. In this case we want to return max(b, c).
// If x=y=1 then a < b, c. In this case we want to return min(b, c).
// By toggling the outcome of b < c using XOR x we get this behavior.
let z = is_less(b, c);
if z ^ x { c } else { b }
} else {
// Either c <= a < b or b <= a < c, thus a is our median.
a
}
}
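
As a rough numeric sketch of the sampling recurrence described in the
comment on `median3_rec`: f(n) = 3 * f(n/8), bottoming out at a plain
median-of-3 below the 64-element threshold (the real code's counts differ
slightly due to integer division; only the asymptotic order matches):

```
fn sampled(n: f64) -> f64 {
    // Mirrors f(n) = 3 * f(n / 8), bottoming out at a single median-of-3.
    if n < 64.0 { 3.0 } else { 3.0 * sampled(n / 8.0) }
}

fn main() {
    for n in [1e3, 1e6, 1e9_f64] {
        let asymptotic = n.powf(3f64.ln() / 8f64.ln()); // n^(log(3)/log(8)) ~= n^0.528
        println!("n = {n:e}: sampled ~= {}, n^0.528 ~= {:.0}", sampled(n), asymptotic);
    }
}
```
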


@ -0,0 +1,843 @@
//! This module contains a variety of sort implementations that are optimized for small lengths.
use crate::intrinsics;
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice;
use crate::slice::sort::shared::FreezeMarker;
// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
/// For which input length <= return value of this function, is it valid to call `small_sort`.
fn small_sort_threshold() -> usize;
/// Sorts `v` using strategies optimized for small sizes.
fn small_sort<F: FnMut(&Self, &Self) -> bool>(
v: &mut [Self],
scratch: &mut [MaybeUninit<Self>],
is_less: &mut F,
);
}
impl<T> StableSmallSortTypeImpl for T {
#[inline(always)]
default fn small_sort_threshold() -> usize {
// Optimal number of comparisons, and good perf.
SMALL_SORT_FALLBACK_THRESHOLD
}
#[inline(always)]
default fn small_sort<F: FnMut(&T, &T) -> bool>(
v: &mut [T],
_scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
}
}
impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
#[inline(always)]
fn small_sort_threshold() -> usize {
SMALL_SORT_GENERAL_THRESHOLD
}
#[inline(always)]
fn small_sort<F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
small_sort_general_with_scratch(v, scratch, is_less);
}
}
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
/// For which input length <= return value of this function, is it valid to call `small_sort`.
fn small_sort_threshold() -> usize;
/// Sorts `v` using strategies optimized for small sizes.
fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
impl<T> UnstableSmallSortTypeImpl for T {
#[inline(always)]
default fn small_sort_threshold() -> usize {
SMALL_SORT_FALLBACK_THRESHOLD
}
#[inline(always)]
default fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
small_sort_fallback(v, is_less);
}
}
impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
#[inline(always)]
fn small_sort_threshold() -> usize {
match const { choose_unstable_small_sort::<T>() } {
UnstalbeSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD,
UnstalbeSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD,
UnstalbeSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD,
}
}
#[inline(always)]
fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// This construct is used to limit the LLVM IR generated, which saves large amounts of
// compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
(const { inst_unstable_small_sort::<T, F>() })(v, is_less);
}
}
/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;
/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;
/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;
/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;
/// Using a stack array, could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
enum UnstalbeSmallSort {
Fallback,
General,
Network,
}
const fn choose_unstable_small_sort<T: FreezeMarker>() -> UnstalbeSmallSort {
if T::is_copy()
&& has_efficient_in_place_swap::<T>()
&& (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
{
// Heuristic for int like types.
return UnstalbeSmallSort::Network;
}
if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
return UnstalbeSmallSort::General;
}
UnstalbeSmallSort::Fallback
}
const fn inst_unstable_small_sort<T: FreezeMarker, F: FnMut(&T, &T) -> bool>()
-> fn(&mut [T], &mut F) {
match const { choose_unstable_small_sort::<T>() } {
UnstalbeSmallSort::Fallback => small_sort_fallback::<T, F>,
UnstalbeSmallSort::General => small_sort_general::<T, F>,
UnstalbeSmallSort::Network => small_sort_network::<T, F>,
}
}
fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
}
fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();
let scratch = unsafe {
slice::from_raw_parts_mut(
stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
SMALL_SORT_GENERAL_SCRATCH_LEN,
)
};
small_sort_general_with_scratch(v, scratch, is_less);
}
fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
let len = v.len();
if len < 2 {
return;
}
if scratch.len() < len + 16 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
let len_div_2 = len / 2;
// SAFETY: See individual comments.
unsafe {
let scratch_base = scratch.as_mut_ptr() as *mut T;
let presorted_len = if const { mem::size_of::<T>() <= 16 } && len >= 16 {
// SAFETY: scratch_base is valid and has enough space.
sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
sort8_stable(
v_base.add(len_div_2),
scratch_base.add(len_div_2),
scratch_base.add(len + 8),
is_less,
);
8
} else if len >= 8 {
// SAFETY: scratch_base is valid and has enough space.
sort4_stable(v_base, scratch_base, is_less);
sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);
4
} else {
ptr::copy_nonoverlapping(v_base, scratch_base, 1);
ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);
1
};
for offset in [0, len_div_2] {
// SAFETY: at this point dst is initialized with presorted_len elements.
// We extend this to desired_len, src is valid for desired_len elements.
let src = v_base.add(offset);
let dst = scratch_base.add(offset);
let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };
for i in presorted_len..desired_len {
ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
insert_tail(dst, dst.add(i), is_less);
}
}
// SAFETY: see comment in `CopyOnDrop::drop`.
let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };
// SAFETY: at this point scratch_base is fully initialized, allowing us
// to use it as the source of our merge back into the original array.
// If a panic occurs we ensure the original array is restored to a valid
// permutation of the input through drop_guard. This technique is similar
// to ping-pong merging.
bidirectional_merge(
&*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
drop_guard.dst,
is_less,
);
mem::forget(drop_guard);
}
}
struct CopyOnDrop<T> {
src: *const T,
dst: *mut T,
len: usize,
}
impl<T> Drop for CopyOnDrop<T> {
fn drop(&mut self) {
// SAFETY: `src` must contain `len` initialized elements, and dst must
// be valid to write `len` elements.
unsafe {
ptr::copy_nonoverlapping(self.src, self.dst, self.len);
}
}
}
fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
T: FreezeMarker,
F: FnMut(&T, &T) -> bool,
{
// This implementation is tuned to be efficient for integer types.
let len = v.len();
if len < 2 {
return;
}
if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
intrinsics::abort();
}
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();
let len_div_2 = len / 2;
let no_merge = len < 18;
let v_base = v.as_mut_ptr();
let initial_region_len = if no_merge { len } else { len_div_2 };
// SAFETY: Both possible values of `initial_region_len` are in-bounds.
let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };
// Avoid compiler unrolling, we *really* don't want that to happen here for binary-size reasons.
loop {
let presorted_len = if region.len() >= 13 {
sort13_optimal(region, is_less);
13
} else if region.len() >= 9 {
sort9_optimal(region, is_less);
9
} else {
1
};
insertion_sort_shift_left(region, presorted_len, is_less);
if no_merge {
return;
}
if region.as_ptr() != v_base {
break;
}
// SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
region =
unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
}
// SAFETY: We checked that T is Freeze and thus observation safe.
// Should is_less panic, v was not modified in bidirectional_merge and retains its original input.
// scratch and v must not alias and scratch has v.len() space.
unsafe {
let scratch_base = stack_array.as_mut_ptr() as *mut T;
bidirectional_merge(
&mut *ptr::slice_from_raw_parts_mut(v_base, len),
scratch_base,
is_less,
);
ptr::copy_nonoverlapping(scratch_base, v_base, len);
}
}
/// Swaps the two values in the slice pointed to by `v_base` at positions `a_pos` and `b_pos` if
/// the value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
// pointers into `v_base`, and are properly aligned, and part of the same allocation.
unsafe {
let v_a = v_base.add(a_pos);
let v_b = v_base.add(b_pos);
// PANIC SAFETY: if is_less panics, no scratch memory was created and the slice is still in a
// well-defined state, without duplicates.
// It is important to only swap if the value at b_pos is strictly less than the one at a_pos,
// and not if they compare equal; is_less returns false for equal elements, so we don't swap.
let should_swap = is_less(&*v_b, &*v_a);
// This is a branchless version of swap-if.
// The equivalent code with a branch would be:
//
// if should_swap {
//     ptr::swap(v_a, v_b);
// }
//
// The goal is to generate cmov instructions here.
let left_swap = if should_swap { v_b } else { v_a };
let right_swap = if should_swap { v_a } else { v_b };
let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
ptr::copy(left_swap, v_a, 1);
ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
}
}
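// Illustrative sketch (hypothetical, not part of the original sources): exercising
// `swap_if_less` on a two-element slice in a test context of this module.
#[test]
fn swap_if_less_example() {
    let mut v = [2_i32, 1];
    // SAFETY: positions 0 and 1 are in-bounds for a slice of length 2.
    unsafe { swap_if_less(v.as_mut_ptr(), 0, 1, &mut |a, b| a < b) };
    assert_eq!(v, [1, 2]);

    // Equal elements are not swapped, since `is_less` returns false for them.
    let mut w = [7_i32, 7];
    // SAFETY: as above.
    unsafe { swap_if_less(w.as_mut_ptr(), 0, 1, &mut |a, b| a < b) };
    assert_eq!(w, [7, 7]);
}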
// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
// performance impact.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if v.len() < 9 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
// Optimal sorting network, see:
// https://bertdobbelaere.github.io/sorting_networks.html.
// SAFETY: We checked the len.
unsafe {
swap_if_less(v_base, 0, 3, is_less);
swap_if_less(v_base, 1, 7, is_less);
swap_if_less(v_base, 2, 5, is_less);
swap_if_less(v_base, 4, 8, is_less);
swap_if_less(v_base, 0, 7, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 3, 8, is_less);
swap_if_less(v_base, 5, 6, is_less);
swap_if_less(v_base, 0, 2, is_less);
swap_if_less(v_base, 1, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 1, 4, is_less);
swap_if_less(v_base, 3, 6, is_less);
swap_if_less(v_base, 5, 7, is_less);
swap_if_less(v_base, 0, 1, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 3, 5, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 6, 7, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
}
}
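// Illustrative sketch (hypothetical, not part of the original sources): the network
// sorts any permutation of 9 elements, e.g. a fully reversed input.
#[test]
fn sort9_optimal_example() {
    let mut v = [9_i32, 8, 7, 6, 5, 4, 3, 2, 1];
    sort9_optimal(&mut v, &mut |a, b| a < b);
    assert_eq!(v, [1, 2, 3, 4, 5, 6, 7, 8, 9]);
}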
// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
// performance impact.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if v.len() < 13 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
// Optimal sorting network, see:
// https://bertdobbelaere.github.io/sorting_networks.html.
// SAFETY: We checked the len.
unsafe {
swap_if_less(v_base, 0, 12, is_less);
swap_if_less(v_base, 1, 10, is_less);
swap_if_less(v_base, 2, 9, is_less);
swap_if_less(v_base, 3, 7, is_less);
swap_if_less(v_base, 5, 11, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 1, 6, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 11, is_less);
swap_if_less(v_base, 7, 9, is_less);
swap_if_less(v_base, 8, 10, is_less);
swap_if_less(v_base, 0, 4, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 6, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 11, 12, is_less);
swap_if_less(v_base, 4, 6, is_less);
swap_if_less(v_base, 5, 9, is_less);
swap_if_less(v_base, 8, 11, is_less);
swap_if_less(v_base, 10, 12, is_less);
swap_if_less(v_base, 0, 5, is_less);
swap_if_less(v_base, 3, 8, is_less);
swap_if_less(v_base, 4, 7, is_less);
swap_if_less(v_base, 6, 11, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 0, 1, is_less);
swap_if_less(v_base, 2, 5, is_less);
swap_if_less(v_base, 6, 9, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 10, 11, is_less);
swap_if_less(v_base, 1, 3, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 7, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 6, 7, is_less);
swap_if_less(v_base, 8, 9, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
}
}
/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
// SAFETY: see individual comments.
unsafe {
// SAFETY: in-bounds as tail > begin.
let mut sift = tail.sub(1);
if !is_less(&*tail, &*sift) {
return;
}
// SAFETY: after this read tail is never read from again, as we only ever
// read from sift, sift < tail and we only ever decrease sift. Thus this is
// effectively a move, not a copy. Should a panic occur, or we have found
// the correct insertion position, gap_guard ensures the element is moved
// back into the array.
let tmp = ManuallyDrop::new(tail.read());
let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };
loop {
// SAFETY: we move sift into the gap (which is valid), and point the
// gap guard destination at sift, ensuring that if a panic occurs the
// gap is once again filled.
ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
gap_guard.dst = sift;
if sift == begin {
break;
}
// SAFETY: we checked that sift != begin, thus this is in-bounds.
sift = sift.sub(1);
if !is_less(&tmp, &*sift) {
break;
}
}
}
}
/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
offset: usize,
is_less: &mut F,
) {
let len = v.len();
if offset == 0 || offset > len {
intrinsics::abort();
}
// SAFETY: see individual comments.
unsafe {
// We write this basic loop directly using pointers, as when we use a
// for loop LLVM likes to unroll this loop which we do not want.
// SAFETY: v_end is the one-past-end pointer, and we checked that
// offset <= len, thus tail is also in-bounds.
let v_base = v.as_mut_ptr();
let v_end = v_base.add(len);
let mut tail = v_base.add(offset);
while tail != v_end {
// SAFETY: v_base and tail are both valid pointers to elements, and
// v_base < tail since we checked offset != 0.
insert_tail(v_base, tail, is_less);
// SAFETY: we checked that tail is not yet the one-past-end pointer.
tail = tail.add(1);
}
}
}
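// Illustrative sketch (hypothetical, not part of the original sources): with
// `offset == 1` this degenerates to a plain insertion sort over the whole slice.
#[test]
fn insertion_sort_shift_left_example() {
    let mut v = [4_i32, 2, 5, 1, 3];
    insertion_sort_shift_left(&mut v, 1, &mut |a, b| a < b);
    assert_eq!(v, [1, 2, 3, 4, 5]);
}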
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
v_base: *const T,
dst: *mut T,
is_less: &mut F,
) {
// By limiting select to picking pointers, we are guaranteed good cmov code-gen
// regardless of type T's size. Further this only does 5 instead of 6
// comparisons compared to a stable transposition 4 element sorting-network,
// and always copies each element exactly once.
// SAFETY: all pointers have offset at most 3 from v_base and dst, and are
// thus in-bounds by the precondition.
unsafe {
// Stably create two pairs a <= b and c <= d.
let c1 = is_less(&*v_base.add(1), &*v_base);
let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
let a = v_base.add(c1 as usize);
let b = v_base.add(!c1 as usize);
let c = v_base.add(2 + c2 as usize);
let d = v_base.add(2 + (!c2 as usize));
// Compare (a, c) and (b, d) to identify max/min. We're left with two
// unknown elements, but because we are a stable sort we must know which
// one is leftmost and which one is rightmost.
// c3, c4 | min max unknown_left unknown_right
// 0, 0 | a d b c
// 0, 1 | a b c d
// 1, 0 | c d a b
// 1, 1 | c b a d
let c3 = is_less(&*c, &*a);
let c4 = is_less(&*d, &*b);
let min = select(c3, c, a);
let max = select(c4, b, d);
let unknown_left = select(c3, a, select(c4, c, b));
let unknown_right = select(c4, d, select(c3, b, c));
// Sort the last two unknown elements.
let c5 = is_less(&*unknown_right, &*unknown_left);
let lo = select(c5, unknown_right, unknown_left);
let hi = select(c5, unknown_left, unknown_right);
ptr::copy_nonoverlapping(min, dst, 1);
ptr::copy_nonoverlapping(lo, dst.add(1), 1);
ptr::copy_nonoverlapping(hi, dst.add(2), 1);
ptr::copy_nonoverlapping(max, dst.add(3), 1);
}
#[inline(always)]
fn select<T>(cond: bool, if_true: *const T, if_false: *const T) -> *const T {
if cond { if_true } else { if_false }
}
}
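// Illustrative sketch (hypothetical, not part of the original sources): sorting four
// elements into a separate destination buffer.
#[test]
fn sort4_stable_example() {
    let v = [3_i32, 1, 4, 2];
    let mut dst = [0_i32; 4];
    // SAFETY: `v` is valid for 4 reads, `dst` is valid for 4 writes, and they don't alias.
    unsafe { sort4_stable(v.as_ptr(), dst.as_mut_ptr(), &mut |a, b| a < b) };
    assert_eq!(dst, [1, 2, 3, 4]);
}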
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v_base: *mut T,
dst: *mut T,
scratch_base: *mut T,
is_less: &mut F,
) {
// SAFETY: these pointers are all in-bounds by the precondition of our function.
unsafe {
sort4_stable(v_base, scratch_base, is_less);
sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
}
// SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
// into dst.
unsafe {
bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
}
}
#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
mut left_src: *const T,
mut right_src: *const T,
mut dst: *mut T,
is_less: &mut F,
) -> (*const T, *const T, *mut T) {
// This is a branchless merge utility function.
// The equivalent code with a branch would be:
//
// if !is_less(&*right_src, &*left_src) {
// ptr::copy_nonoverlapping(left_src, dst, 1);
// left_src = left_src.add(1);
// } else {
// ptr::copy_nonoverlapping(right_src, dst, 1);
// right_src = right_src.add(1);
// }
// dst = dst.add(1);
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
// to read and `dst` is valid to write, while not aliasing.
unsafe {
let is_l = !is_less(&*right_src, &*left_src);
let src = if is_l { left_src } else { right_src };
ptr::copy_nonoverlapping(src, dst, 1);
right_src = right_src.add(!is_l as usize);
left_src = left_src.add(is_l as usize);
dst = dst.add(1);
}
(left_src, right_src, dst)
}
#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
mut left_src: *const T,
mut right_src: *const T,
mut dst: *mut T,
is_less: &mut F,
) -> (*const T, *const T, *mut T) {
// This is a branchless merge utility function.
// The equivalent code with a branch would be:
//
// if !is_less(&*right_src, &*left_src) {
// ptr::copy_nonoverlapping(right_src, dst, 1);
// right_src = right_src.wrapping_sub(1);
// } else {
// ptr::copy_nonoverlapping(left_src, dst, 1);
// left_src = left_src.wrapping_sub(1);
// }
// dst = dst.sub(1);
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
// to read and `dst` is valid to write, while not aliasing.
unsafe {
let is_l = !is_less(&*right_src, &*left_src);
let src = if is_l { right_src } else { left_src };
ptr::copy_nonoverlapping(src, dst, 1);
right_src = right_src.wrapping_sub(is_l as usize);
left_src = left_src.wrapping_sub(!is_l as usize);
dst = dst.sub(1);
}
(left_src, right_src, dst)
}
/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze, the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v: &[T],
dst: *mut T,
is_less: &mut F,
) {
// It helps to visualize the merge:
//
// Initial:
//
// |dst (in dst)
// |left |right
// v v
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
// ^ ^
// |left_rev |right_rev
// |dst_rev (in dst)
//
// After:
//
// |dst (in dst)
// |left | |right
// v v v
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
// ^ ^ ^
// |left_rev | |right_rev
// |dst_rev (in dst)
//
// In each iteration one of left or right moves up one position, and one of
// left_rev or right_rev moves down one position, whereas dst always moves
// up one position and dst_rev always moves down one position. Assuming
// the input was sorted and the comparison function is correctly implemented
// at the end we will have left == left_rev + 1, and right == right_rev + 1,
// fully consuming the input having written it to dst.
let len = v.len();
let src = v.as_ptr();
let len_div_2 = len / 2;
// SAFETY: The caller has to ensure that len >= 2.
unsafe {
intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
}
// SAFETY: no matter what the result of the user-provided comparison function
// is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
// will always be in bounds if the caller guarantees that `dst` is valid for
// `v.len()` writes.
unsafe {
let mut left = src;
let mut right = src.add(len_div_2);
let mut dst = dst;
let mut left_rev = src.add(len_div_2 - 1);
let mut right_rev = src.add(len - 1);
let mut dst_rev = dst.add(len - 1);
for _ in 0..len_div_2 {
(left, right, dst) = merge_up(left, right, dst, is_less);
(left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
}
let left_end = left_rev.wrapping_add(1);
let right_end = right_rev.wrapping_add(1);
// Odd length, so one element is left unconsumed in the input.
if len % 2 != 0 {
let left_nonempty = left < left_end;
let last_src = if left_nonempty { left } else { right };
ptr::copy_nonoverlapping(last_src, dst, 1);
left = left.add(left_nonempty as usize);
right = right.add((!left_nonempty) as usize);
}
// We now should have consumed the full input exactly once. This can
// only fail if the comparison operator fails to be Ord, in which case
// we will panic and never access the inconsistent state in dst.
if left != left_end || right != right_end {
panic_on_ord_violation();
}
}
}
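// Illustrative sketch (hypothetical, not part of the original sources): merging the two
// pre-sorted halves of a slice into a separate destination buffer.
#[test]
fn bidirectional_merge_example() {
    let v = [1_i32, 3, 5, 2, 4, 6];
    let mut dst = [0_i32; 6];
    // SAFETY: i32 is Freeze, `dst` is valid for `v.len()` writes, it does not alias `v`,
    // and `v.len() >= 2`.
    unsafe { bidirectional_merge(&v, dst.as_mut_ptr(), &mut |a, b| a < b) };
    assert_eq!(dst, [1, 2, 3, 4, 5, 6]);
}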
#[inline(never)]
fn panic_on_ord_violation() -> ! {
panic!("Ord violation");
}
#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
// Heuristic that holds true on all tested 64-bit capable architectures.
mem::size_of::<T>() <= 8 // mem::size_of::<u64>()
}
#[test]
fn type_info() {
assert!(has_efficient_in_place_swap::<i32>());
assert!(has_efficient_in_place_swap::<u64>());
assert!(!has_efficient_in_place_swap::<u128>());
assert!(!has_efficient_in_place_swap::<String>());
}
/// SAFETY: Only used as a run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}
impl<T: Copy> CopyMarker for T {}
#[const_trait]
trait IsCopy {
fn is_copy() -> bool;
}
impl<T> const IsCopy for T {
default fn is_copy() -> bool {
false
}
}
impl<T: CopyMarker> const IsCopy for T {
fn is_copy() -> bool {
true
}
}

View File

@ -0,0 +1,300 @@
//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down
//! Quicksort.
use crate::cmp;
use crate::intrinsics;
use crate::mem::MaybeUninit;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::stable::merge::merge;
use crate::slice::sort::stable::quicksort::quicksort;
/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true,
/// it will only do small-sorts and physical merges, ensuring O(N * log(N))
/// worst-case complexity. `scratch.len()` must be at least `max(v.len() / 2,
/// MIN_SMALL_SORT_SCRATCH_LEN)` otherwise the implementation may abort.
/// Fully ascending and descending inputs will be sorted with exactly N - 1
/// comparisons.
///
/// This is the main loop for driftsort, which uses powersort's heuristic to
/// determine in which order to merge runs, see below for details.
pub fn sort<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
eager_sort: bool,
is_less: &mut F,
) {
let len = v.len();
if len < 2 {
return; // Removing this length check *increases* code size.
}
let scale_factor = merge_tree_scale_factor(len);
// It's important to have a relatively high entry barrier for pre-sorted
// runs, as the presence of a single such run will force on average several
// merge operations and shrink the maximum quicksort size a lot. For that
// reason we use sqrt(len) as our pre-sorted run threshold.
const MIN_SQRT_RUN_LEN: usize = 64;
let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) {
// For small input length `MIN_SQRT_RUN_LEN` would break pattern
// detection of full or nearly sorted inputs.
cmp::min(len - len / 2, MIN_SQRT_RUN_LEN)
} else {
sqrt_approx(len)
};
// (stack_len, runs, desired_depths) together form a stack maintaining run
// information for the powersort heuristic. desired_depths[i] is the desired
// depth of the merge node that merges runs[i] with the run that comes after
// it.
let mut stack_len = 0;
let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit();
let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast();
let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit();
let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast();
let mut scan_idx = 0;
let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run.
loop {
// Compute the next run and the desired depth of the merge node between
// prev_run and next_run. On the last iteration we create a dummy run
// with root-level desired depth to fully collapse the merge tree.
let (next_run, desired_depth);
if scan_idx < len {
next_run =
create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less);
desired_depth = merge_tree_depth(
scan_idx - prev_run.len(),
scan_idx,
scan_idx + next_run.len(),
scale_factor,
);
} else {
next_run = DriftsortRun::new_sorted(0);
desired_depth = 0;
};
// Process the merge nodes between earlier runs[i] that have a desire to
// be deeper in the merge tree than the merge node for the splitpoint
// between prev_run and next_run.
//
// SAFETY: first note that this is the only place we modify stack_len,
// runs or desired depths. We maintain the following invariants:
// 1. The first stack_len elements of runs/desired_depths are initialized.
// 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1].
// 3. The sum of all valid runs[i].len() plus prev_run.len() equals
// scan_idx.
unsafe {
while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth {
// Desired depth greater than the upcoming desired depth, pop
// left neighbor run from stack and merge into prev_run.
let left = *runs.add(stack_len - 1);
let merged_len = left.len() + prev_run.len();
let merge_start_idx = scan_idx - merged_len;
let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx);
prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less);
stack_len -= 1;
}
// We now know that desired_depths[stack_len - 1] < desired_depth,
// maintaining our invariant. This also guarantees we don't overflow
// the stack as merge_tree_depth(..) <= 64 and thus we can only have
// 64 distinct values on the stack before pushing, plus our initial
// dummy run, while our capacity is 66.
*runs.add(stack_len) = prev_run;
*desired_depths.add(stack_len) = desired_depth;
stack_len += 1;
}
// Break before overwriting the last run with our dummy run.
if scan_idx >= len {
break;
}
scan_idx += next_run.len();
prev_run = next_run;
}
if !prev_run.sorted() {
stable_quicksort(v, scratch, is_less);
}
}
// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally
// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild.
//
// This method forms a binary merge tree, where each internal node corresponds
// to a splitting point between the adjacent runs that have to be merged. If we
// visualize our array as the number line from 0 to 1, we want to find the
// dyadic fraction with smallest denominator that lies between the midpoints of
// our to-be-merged slices. The exponent in the dyadic fraction indicates the
// desired depth in the binary merge tree this internal node wishes to have.
// This does not always correspond to the actual depth due to the inherent
// imbalance in runs, but we follow it as closely as possible.
//
// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then
// finding the simplest dyadic fraction between midpoints corresponds to finding
// the most significant bit difference of the midpoints. We save scale_factor =
// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding
// having to repeatedly do integer divides. This rescaling isn't exact when n is
// not a power of two since we use integers and not reals, but the result is
// very close, and in fact when n < 2^30 the resulting tree is equivalent as the
// approximation errors stay entirely in the lower order bits.
//
// Thus for the splitting point between two adjacent slices [a, b) and [b, c)
// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f),
// where CLZ counts the number of leading zeros in an integer and f is our scale
// factor. Note that we omitted the division by two in the midpoint
// calculations, as this simply shifts the bits by one position (and thus always
// adds one to the result), and we only care about the relative depths.
//
// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then
// x < (2^62 / n + 1) * 2n
// x < 2^63 + 2n
// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not
// overflow.
#[inline(always)]
fn merge_tree_scale_factor(n: usize) -> u64 {
if usize::BITS > u64::BITS {
panic!("Platform not supported");
}
((1 << 62) + n as u64 - 1) / n as u64
}
// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must
// differ in some bit, and is <= 64 always.
#[inline(always)]
fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 {
let x = left as u64 + mid as u64;
let y = mid as u64 + right as u64;
((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8
}
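// Illustrative sketch (hypothetical, not part of the original sources): for a
// power-of-two length the heuristic reproduces a balanced binary merge tree.
#[test]
fn merge_tree_depth_example() {
    let f = merge_tree_scale_factor(8);
    // Splitting [0, 8) at the midpoint yields the shallowest (root-level) merge node...
    assert_eq!(merge_tree_depth(0, 4, 8, f), 1);
    // ...while the quarter points lie one level deeper.
    assert_eq!(merge_tree_depth(0, 2, 4, f), 2);
    assert_eq!(merge_tree_depth(4, 6, 8, f), 2);
}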
fn sqrt_approx(n: usize) -> usize {
// Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these
// two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer
// log floors we want to add 0.5 to compensate for this on average, so our
// initial approximation is 2^((1 + floor(log2(n))) / 2).
//
// We then apply an iteration of Newton's method to improve our
// approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2.
//
// Finally we note that the exponentiation / division can be done directly
// with shifts. We OR with 1 to avoid zero-checks in the integer log.
let ilog = (n | 1).ilog2();
let shift = (1 + ilog) / 2;
((1 << shift) + (n >> shift)) / 2
}
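// Illustrative sketch (hypothetical, not part of the original sources): a quick
// sanity check of the approximation quality.
#[test]
fn sqrt_approx_example() {
    // n = 4096: ilog2(4097) == 12, shift == 6, giving (64 + 64) / 2 == 64 == sqrt(4096).
    assert_eq!(sqrt_approx(4096), 64);
    // n = 1000: ilog2(1001) == 9, shift == 5, giving (32 + 31) / 2 == 31 (true sqrt is ~31.6).
    assert_eq!(sqrt_approx(1000), 31);
}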
// Lazy logical runs as in Glidesort.
#[inline(always)]
fn logical_merge<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
left: DriftsortRun,
right: DriftsortRun,
is_less: &mut F,
) -> DriftsortRun {
// If one or both of the runs are sorted do a physical merge, using
// quicksort to sort the unsorted run if present. We also *need* to
// physically merge if the combined runs would not fit in the scratch space
// anymore (as this would mean we are no longer able to quicksort them).
let len = v.len();
let can_fit_in_scratch = len <= scratch.len();
if !can_fit_in_scratch || left.sorted() || right.sorted() {
if !left.sorted() {
stable_quicksort(&mut v[..left.len()], scratch, is_less);
}
if !right.sorted() {
stable_quicksort(&mut v[left.len()..], scratch, is_less);
}
merge(v, scratch, left.len(), is_less);
DriftsortRun::new_sorted(len)
} else {
DriftsortRun::new_unsorted(len)
}
}
/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run that clears the `min_good_run_len` threshold it is returned as a sorted
/// run. If not, the result depends on the value of `eager_sort`. If it is true,
/// then a sorted run of length `T::small_sort_threshold()` is returned, and if it
/// is false an unsorted run of length `min_good_run_len` is returned.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
min_good_run_len: usize,
eager_sort: bool,
is_less: &mut F,
) -> DriftsortRun {
let len = v.len();
if len >= min_good_run_len {
let (run_len, was_reversed) = find_existing_run(v, is_less);
// SAFETY: find_existing_run promises to return a valid run_len.
unsafe { intrinsics::assume(run_len <= len) };
if run_len >= min_good_run_len {
if was_reversed {
v[..run_len].reverse();
}
return DriftsortRun::new_sorted(run_len);
}
}
if eager_sort {
// We call quicksort with a len that will immediately call small-sort.
// By not calling the small-sort directly here it can always be inlined into
// the quicksort itself, making the recursive base case faster and generally
// more binary-size efficient.
let eager_run_len = cmp::min(T::small_sort_threshold(), len);
quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less);
DriftsortRun::new_sorted(eager_run_len)
} else {
DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len))
}
}
fn stable_quicksort<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
// Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
// The binary OR by one is used to eliminate the zero-check in the logarithm.
let limit = 2 * (v.len() | 1).ilog2();
quicksort(v, scratch, limit, None, is_less);
}
/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a usize because the maximum slice length is isize::MAX.
#[derive(Copy, Clone)]
struct DriftsortRun(usize);
impl DriftsortRun {
#[inline(always)]
fn new_sorted(length: usize) -> Self {
Self((length << 1) | 1)
}
#[inline(always)]
fn new_unsorted(length: usize) -> Self {
Self(length << 1)
}
#[inline(always)]
fn sorted(self) -> bool {
self.0 & 1 == 1
}
#[inline(always)]
fn len(self) -> usize {
self.0 >> 1
}
}
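// Illustrative sketch (hypothetical, not part of the original sources): the length is
// stored in the upper bits and the sorted flag in the lowest bit.
#[test]
fn driftsort_run_packing_example() {
    let sorted = DriftsortRun::new_sorted(5);
    assert!(sorted.sorted());
    assert_eq!(sorted.len(), 5);

    let unsorted = DriftsortRun::new_unsorted(5);
    assert!(!unsorted.sorted());
    assert_eq!(unsorted.len(), 5);
}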

View File

@ -0,0 +1,151 @@
//! This module contains logic for performing a merge of two sorted sub-slices.
use crate::cmp;
use crate::mem::MaybeUninit;
use crate::ptr;
/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `scratch` as
/// temporary storage, and stores the result into `v[..]`.
pub fn merge<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
mid: usize,
is_less: &mut F,
) {
let len = v.len();
if mid == 0 || mid >= len || scratch.len() < cmp::min(mid, len - mid) {
return;
}
// SAFETY: We checked that the two slices are non-empty and `mid` is in-bounds.
// We checked that the buffer `scratch` has enough capacity to hold a copy of
// the shorter slice. `merge_up` and `merge_down` are written in such a way that
// they uphold the contract described in `MergeState::drop`.
unsafe {
// The merge process first copies the shorter run into `buf`. Then it traces
// the newly copied run and the longer run forwards (or backwards), comparing
// their next unconsumed elements and copying the lesser (or greater) one into `v`.
//
// As soon as the shorter run is fully consumed, the process is done. If the
// longer run gets consumed first, then we must copy whatever is left of the
// shorter run into the remaining gap in `v`.
//
// Intermediate state of the process is always tracked by `gap`, which serves
// two purposes:
// 1. Protects integrity of `v` from panics in `is_less`.
// 2. Fills the remaining gap in `v` if the longer run gets consumed first.
let buf = MaybeUninit::slice_as_mut_ptr(scratch);
let v_base = v.as_mut_ptr();
let v_mid = v_base.add(mid);
let v_end = v_base.add(len);
let left_len = mid;
let right_len = len - mid;
let left_is_shorter = left_len <= right_len;
let save_base = if left_is_shorter { v_base } else { v_mid };
let save_len = if left_is_shorter { left_len } else { right_len };
ptr::copy_nonoverlapping(save_base, buf, save_len);
let mut merge_state = MergeState { start: buf, end: buf.add(save_len), dst: save_base };
if left_is_shorter {
merge_state.merge_up(v_mid, v_end, is_less);
} else {
merge_state.merge_down(v_base, buf, v_end, is_less);
}
// Finally, `merge_state` gets dropped. If the shorter run was not fully
// consumed, whatever remains of it will now be copied into the hole in `v`.
}
// When dropped, copies the range `start..end` into `dst..`.
struct MergeState<T> {
start: *mut T,
end: *mut T,
dst: *mut T,
}
impl<T> MergeState<T> {
/// # Safety
/// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is
/// the longer sub-slice and so that `dst` can be written to at least the shorter sub-slice
/// length times. In addition `start -> end` and `right -> right_end` MUST be valid to be
/// read. This function MUST only be called once.
unsafe fn merge_up<F: FnMut(&T, &T) -> bool>(
&mut self,
mut right: *const T,
right_end: *const T,
is_less: &mut F,
) {
// SAFETY: See function safety comment.
unsafe {
let left = &mut self.start;
let out = &mut self.dst;
while *left != self.end && right as *const T != right_end {
let consume_left = !is_less(&*right, &**left);
let src = if consume_left { *left } else { right };
ptr::copy_nonoverlapping(src, *out, 1);
*left = left.add(consume_left as usize);
right = right.add(!consume_left as usize);
*out = out.add(1);
}
}
}
/// # Safety
/// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst` is
/// the shorter sub-slice and so that `out` can be written to at least the shorter sub-slice
/// length times. In addition `left_end <- dst` and `right_end <- end` MUST be valid to be
/// read. This function MUST only be called once.
unsafe fn merge_down<F: FnMut(&T, &T) -> bool>(
&mut self,
left_end: *const T,
right_end: *const T,
mut out: *mut T,
is_less: &mut F,
) {
// SAFETY: See function safety comment.
unsafe {
loop {
let left = self.dst.sub(1);
let right = self.end.sub(1);
out = out.sub(1);
let consume_left = is_less(&*right, &*left);
let src = if consume_left { left } else { right };
ptr::copy_nonoverlapping(src, out, 1);
self.dst = left.add(!consume_left as usize);
self.end = right.add(consume_left as usize);
if self.dst as *const T == left_end || self.end as *const T == right_end {
break;
}
}
}
}
}
impl<T> Drop for MergeState<T> {
fn drop(&mut self) {
// SAFETY: The user of MergeState MUST ensure, that at any point this drop
// impl MAY run, for example when the user provided `is_less` panics, that
// copying the contiguous region between `start` and `end` to `dst` will
// leave the input slice `v` with each original element and all possible
// modifications observed.
unsafe {
let len = self.end.sub_ptr(self.start);
ptr::copy_nonoverlapping(self.start, self.dst, len);
}
}
}
}
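// Illustrative sketch (hypothetical, not part of the original sources): merging two
// pre-sorted runs in place, with scratch space sized for the shorter run.
#[test]
fn merge_example() {
    let mut v = [1_i32, 3, 5, 2, 4, 6];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 3];
    merge(&mut v, &mut scratch, 3, &mut |a, b| a < b);
    assert_eq!(v, [1, 2, 3, 4, 5, 6]);
}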

View File

@ -0,0 +1,116 @@
//! This module contains the entry points for `slice::sort`.
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
use crate::slice::sort::shared::smallsort::{
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};
pub(crate) mod drift;
pub(crate) mod merge;
pub(crate) mod quicksort;
/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// Arrays of zero-sized types are always all-equal, and thus sorted.
if T::IS_ZST {
return;
}
// Instrumenting the standard library showed that 90+% of the calls to sort
// by rustc are either of size 0 or 1.
let len = v.len();
if intrinsics::likely(len < 2) {
return;
}
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
driftsort_main::<T, F, BufT>(v, is_less);
}
/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// By allocating n elements of memory we can ensure the entire input can
// be sorted using stable quicksort, which allows better performance on
// random and low-cardinality distributions. However, we still want to
// reduce our memory usage to n / 2 for large inputs. We do this by scaling
// our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
// small inputs and n / 2 for large inputs, without a sudden drop off. We
// also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the
// small-sort always needs this much memory.
const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
let len = v.len();
let alloc_len =
cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
// For small inputs 4KiB of stack storage suffices, which allows us to avoid
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
let mut stack_buf = AlignedStorage::<T, 4096>::new();
let stack_scratch = stack_buf.as_uninit_slice_mut();
let mut heap_buf;
let scratch = if stack_scratch.len() >= alloc_len {
stack_scratch
} else {
heap_buf = BufT::with_capacity(alloc_len);
heap_buf.as_uninit_slice_mut()
};
// For small inputs using quicksort is not yet beneficial, and a single
// small-sort or two small-sorts plus a single merge outperforms it, so use
// eager mode.
let eager_sort = len <= T::small_sort_threshold() * 2;
crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}
#[doc(hidden)]
/// Abstracts owned memory buffer, so that sort code can live in core where no allocation is
/// possible. This trait can then be implemented in a place that has access to allocation.
pub trait BufGuard<T> {
/// Creates new buffer that holds at least `capacity` memory.
fn with_capacity(capacity: usize) -> Self;
/// Returns mutable access to uninitialized memory owned by the buffer.
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
#[repr(C)]
struct AlignedStorage<T, const N: usize> {
_align: [T; 0],
storage: [MaybeUninit<u8>; N],
}
impl<T, const N: usize> AlignedStorage<T, N> {
fn new() -> Self {
Self { _align: [], storage: MaybeUninit::uninit_array() }
}
fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
let len = N / mem::size_of::<T>();
// SAFETY: `_align` ensures we are correctly aligned.
unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
}
}
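// Illustrative sketch (hypothetical, not part of the original sources): the scratch
// scaling used in `driftsort_main`, written out as a hypothetical helper for `u64`
// elements (8 bytes, so `max_full_alloc` is 1_000_000 elements).
#[cfg(test)]
fn alloc_len_for_u64(len: usize) -> usize {
    let max_full_alloc = 8_000_000 / mem::size_of::<u64>();
    cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN)
}

#[test]
fn alloc_len_scaling_example() {
    // Small inputs get a scratch buffer as large as the input...
    assert_eq!(alloc_len_for_u64(10_000), 10_000);
    // ...while large inputs fall back to half the input length.
    assert_eq!(alloc_len_for_u64(4_000_000), 2_000_000);
}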

View File

@ -0,0 +1,267 @@
//! This module contains a stable quicksort and partition implementation.
use crate::intrinsics;
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::ptr;
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl;
use crate::slice::sort::shared::FreezeMarker;
/// Sorts `v` recursively using quicksort.
///
/// `limit`, when initialized with `c*log(v.len())` for some c, ensures we do not
/// overflow the stack or go quadratic.
#[inline(never)]
pub fn quicksort<T, F: FnMut(&T, &T) -> bool>(
mut v: &mut [T],
scratch: &mut [MaybeUninit<T>],
mut limit: u32,
mut left_ancestor_pivot: Option<&T>,
is_less: &mut F,
) {
loop {
let len = v.len();
if len <= T::small_sort_threshold() {
T::small_sort(v, scratch, is_less);
return;
}
if limit == 0 {
// We have had too many bad pivots, switch to O(n log n) fallback
// algorithm. In our case that is driftsort in eager mode.
crate::slice::sort::stable::drift::sort(v, scratch, true, is_less);
return;
}
limit -= 1;
let pivot_pos = choose_pivot(v, is_less);
// SAFETY: choose_pivot promises to return a valid pivot index.
unsafe {
intrinsics::assume(pivot_pos < v.len());
}
// SAFETY: We only access the temporary copy for Freeze types, otherwise
// self-modifications via `is_less` would not be observed and this would
// be unsound. Our temporary copy does not escape this scope.
let pivot_copy = unsafe { ManuallyDrop::new(ptr::read(&v[pivot_pos])) };
let pivot_ref = (!has_direct_interior_mutability::<T>()).then_some(&*pivot_copy);
// We choose a pivot, and check if this pivot is equal to our left
// ancestor. If true, we do a partition putting equal elements on the
// left and do not recurse on it. This gives O(n log k) sorting for k
// distinct values, a strategy borrowed from pdqsort. For types with
// interior mutability we can't soundly create a temporary copy of the
// ancestor pivot, and use left_partition_len == 0 as our method for
// detecting when we re-use a pivot, which means we do at most three
// partition operations with pivot p instead of the optimal two.
let mut perform_equal_partition = false;
if let Some(la_pivot) = left_ancestor_pivot {
perform_equal_partition = !is_less(la_pivot, &v[pivot_pos]);
}
let mut left_partition_len = 0;
if !perform_equal_partition {
left_partition_len = stable_partition(v, scratch, pivot_pos, false, is_less);
perform_equal_partition = left_partition_len == 0;
}
if perform_equal_partition {
let mid_eq = stable_partition(v, scratch, pivot_pos, true, &mut |a, b| !is_less(b, a));
v = &mut v[mid_eq..];
left_ancestor_pivot = None;
continue;
}
// Process left side with the next loop iter, right side with recursion.
let (left, right) = v.split_at_mut(left_partition_len);
quicksort(right, scratch, limit, pivot_ref, is_less);
v = left;
}
}
/// Partitions `v` using pivot `p = v[pivot_pos]` and returns the number of
/// elements less than `p`. The relative order of elements that compare < p and
/// those that compare >= p is preserved - it is a stable partition.
///
/// If `is_less` is not a strict total order or panics, `scratch.len() < v.len()`,
/// or `pivot_pos >= v.len()`, the result and `v`'s state are sound but unspecified.
fn stable_partition<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
pivot_pos: usize,
pivot_goes_left: bool,
is_less: &mut F,
) -> usize {
let len = v.len();
if intrinsics::unlikely(scratch.len() < len || pivot_pos >= len) {
core::intrinsics::abort()
}
let v_base = v.as_ptr();
let scratch_base = MaybeUninit::slice_as_mut_ptr(scratch);
// The core idea is to write the values that compare as less-than to the left
// side of `scratch`, while the values that compared as greater or equal than
// `v[pivot_pos]` go to the right side of `scratch` in reverse. See
// PartitionState for details.
// SAFETY: see individual comments.
unsafe {
// SAFETY: we made sure the scratch has length >= len and that pivot_pos
// is in-bounds. v and scratch are disjoint slices.
let pivot = v_base.add(pivot_pos);
let mut state = PartitionState::new(v_base, scratch_base, len);
let mut pivot_in_scratch = ptr::null_mut();
let mut loop_end_pos = pivot_pos;
// SAFETY: this loop is equivalent to calling state.partition_one
// exactly len times.
loop {
// Ideally the outer loop won't be unrolled, to save binary size,
// but we do want the inner loop to be unrolled for small types, as
// this gave significant performance boosts in benchmarks. Unrolling
// through for _ in 0..UNROLL_LEN { .. } instead of manually improves
// compile times but has a ~10-20% performance penalty on opt-level=s.
if const { mem::size_of::<T>() <= 16 } {
const UNROLL_LEN: usize = 4;
let unroll_end = v_base.add(loop_end_pos.saturating_sub(UNROLL_LEN - 1));
while state.scan < unroll_end {
state.partition_one(is_less(&*state.scan, &*pivot));
state.partition_one(is_less(&*state.scan, &*pivot));
state.partition_one(is_less(&*state.scan, &*pivot));
state.partition_one(is_less(&*state.scan, &*pivot));
}
}
let loop_end = v_base.add(loop_end_pos);
while state.scan < loop_end {
state.partition_one(is_less(&*state.scan, &*pivot));
}
if loop_end_pos == len {
break;
}
// We avoid comparing pivot with itself, as this could create deadlocks for
// certain comparison operators. We also store its location for later.
pivot_in_scratch = state.partition_one(pivot_goes_left);
loop_end_pos = len;
}
// `pivot` must be copied into its correct position again, because a
// comparison operator might have modified it.
if has_direct_interior_mutability::<T>() {
ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
}
// SAFETY: partition_one being called exactly len times guarantees that scratch
// is initialized with a permuted copy of `v`, and that num_left <= v.len().
// Copying scratch[0..num_left] and scratch[num_left..v.len()] back is thus
// sound, as the values in scratch will never be read again, meaning our copies
// semantically act as moves, permuting `v`.
// Copy all the elements < p directly from swap to v.
let v_base = v.as_mut_ptr();
ptr::copy_nonoverlapping(scratch_base, v_base, state.num_left);
// Copy the elements >= p in reverse order.
for i in 0..len - state.num_left {
ptr::copy_nonoverlapping(
scratch_base.add(len - 1 - i),
v_base.add(state.num_left + i),
1,
);
}
state.num_left
}
}
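// Illustrative sketch (hypothetical, not part of the original sources): partitioning
// around a duplicated pivot value while preserving the relative order on both sides.
#[test]
fn stable_partition_example() {
    let mut v = [3_i32, 5, 1, 5, 2];
    let mut scratch = [MaybeUninit::<i32>::uninit(); 5];
    // Partition around v[1] == 5, with the pivot going to the right side.
    let num_lt = stable_partition(&mut v, &mut scratch, 1, false, &mut |a, b| a < b);
    assert_eq!(num_lt, 3);
    assert_eq!(v, [3, 1, 2, 5, 5]);
}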
struct PartitionState<T> {
// The start of the scratch auxiliary memory.
scratch_base: *mut T,
// The current element that is being looked at, scans left to right through slice.
scan: *const T,
// Counts the number of elements that went to the left side, also works around:
// https://github.com/rust-lang/rust/issues/117128
num_left: usize,
// Reverse scratch output pointer.
scratch_rev: *mut T,
}
impl<T> PartitionState<T> {
/// # Safety
/// scan and scratch must point to valid disjoint buffers of length len. The
/// scan buffer must be initialized.
unsafe fn new(scan: *const T, scratch: *mut T, len: usize) -> Self {
// SAFETY: See function safety comment.
unsafe { Self { scratch_base: scratch, scan, num_left: 0, scratch_rev: scratch.add(len) } }
}
/// Depending on the value of `towards_left` this function will write a value
/// to the growing left or right side of the scratch memory. This forms the
/// branchless core of the partition.
///
/// # Safety
/// This function may be called at most `len` times. If it is called exactly
/// `len` times the scratch buffer then contains a copy of each element from
/// the scan buffer exactly once - a permutation, and num_left <= len.
unsafe fn partition_one(&mut self, towards_left: bool) -> *mut T {
// SAFETY: see individual comments.
unsafe {
// SAFETY: in-bounds because this function is called at most len times, and thus
// right now is incremented at most len - 1 times. Similarly, num_left < len and
// num_right < len, where num_right == i - num_left at the start of the ith
// iteration (zero-indexed).
self.scratch_rev = self.scratch_rev.sub(1);
// SAFETY: now we have scratch_rev == base + len - (i + 1). This means
// scratch_rev + num_left == base + len - 1 - num_right < base + len.
let dst_base = if towards_left { self.scratch_base } else { self.scratch_rev };
let dst = dst_base.add(self.num_left);
ptr::copy_nonoverlapping(self.scan, dst, 1);
self.num_left += towards_left as usize;
self.scan = self.scan.add(1);
dst
}
}
}
#[const_trait]
trait IsFreeze {
fn is_freeze() -> bool;
}
impl<T> const IsFreeze for T {
default fn is_freeze() -> bool {
false
}
}
impl<T: FreezeMarker> const IsFreeze for T {
fn is_freeze() -> bool {
true
}
}
#[must_use]
const fn has_direct_interior_mutability<T>() -> bool {
// If a type has interior mutability it may alter itself during comparison
// in a way that must be preserved after the sort operation concludes.
// Otherwise a type like Mutex<Option<Box<str>>> could lead to double free.
!T::is_freeze()
}
#[test]
fn freeze_check() {
assert!(!has_direct_interior_mutability::<u32>());
assert!(!has_direct_interior_mutability::<[u128; 2]>());
assert!(has_direct_interior_mutability::<crate::cell::Cell<u32>>());
assert!(has_direct_interior_mutability::<crate::sync::Mutex<u32>>());
}

View File

@ -0,0 +1,80 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.
use crate::intrinsics;
use crate::ptr;
/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, as it sits in the main hot-loop in `recurse` and is meant as an unlikely
/// algorithmic fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(v.len() >= 2);
// Build the heap in linear time.
for i in (0..v.len() / 2).rev() {
sift_down(v, i, is_less);
}
// Pop maximal elements from the heap.
for i in (1..v.len()).rev() {
v.swap(0, i);
sift_down(&mut v[..i], 0, is_less);
}
}
}
// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(node < v.len());
}
let len = v.len();
let v_base = v.as_mut_ptr();
loop {
// Children of `node`.
let mut child = 2 * node + 1;
if child >= len {
break;
}
// SAFETY: The invariants and checks guarantee that both node and child are in-bounds.
unsafe {
// Choose the greater child.
if child + 1 < len {
// We need a branch to be sure not to out-of-bounds index,
// but it's highly predictable. The comparison, however,
// is better done branchless, especially for primitives.
child += is_less(&*v_base.add(child), &*v_base.add(child + 1)) as usize;
}
// Stop if the invariant holds at `node`.
if !is_less(&*v_base.add(node), &*v_base.add(child)) {
break;
}
// Swap `node` with the greater child, move one step down, and continue sifting. This
// could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
ptr::swap(v_base.add(node), v_base.add(child));
}
node = child;
}
}
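// Illustrative sketch (hypothetical, not part of the original sources): heapsort as used
// here is a plain max-heap sort producing ascending order.
#[test]
fn heapsort_example() {
    let mut v = [5_i32, 1, 4, 2, 3];
    // SAFETY: the slice has at least two elements.
    unsafe { heapsort(&mut v, &mut |a, b| a < b) };
    assert_eq!(v, [1, 2, 3, 4, 5]);
}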

View File

@ -0,0 +1,76 @@
//! This module contains the entry points for `slice::sort_unstable`.
use crate::intrinsics;
use crate::mem::SizedTypeProperties;
use crate::slice::sort::shared::find_existing_run;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
pub(crate) mod heapsort;
pub(crate) mod quicksort;
/// Unstable sort called ipnsort by Lukas Bergdoll.
/// Design document:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md
///
/// Upholds all safety properties outlined here:
/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
// Arrays of zero-sized types are always all-equal, and thus sorted.
if T::IS_ZST {
return;
}
// Instrumenting the standard library showed that 90+% of the calls to sort
// by rustc are either of size 0 or 1.
let len = v.len();
if intrinsics::likely(len < 2) {
return;
}
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
ipnsort(v, is_less);
}
/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
let (run_len, was_reversed) = find_existing_run(v, is_less);
// SAFETY: find_existing_run promises to return a valid run_len.
unsafe { intrinsics::assume(run_len <= len) };
if run_len == len {
if was_reversed {
v.reverse();
}
// It would be possible to do in-place merging here for a long existing streak, but that
// makes the implementation a lot bigger; users can use `slice::sort` for that use-case.
return;
}
// Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
// The binary OR by one is used to eliminate the zero-check in the logarithm.
let limit = 2 * (len | 1).ilog2();
crate::slice::sort::unstable::quicksort::quicksort(v, None, limit, is_less);
}

View File

@ -0,0 +1,347 @@
//! This module contains an unstable quicksort and two partition implementations.
use crate::intrinsics;
use crate::mem::{self, ManuallyDrop};
use crate::ptr;
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;
/// Sorts `v` recursively.
///
/// If the slice had a predecessor in the original array, it is specified as `ancestor_pivot`.
///
/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
/// this function will immediately switch to heapsort.
pub(crate) fn quicksort<'a, T, F>(
mut v: &'a mut [T],
mut ancestor_pivot: Option<&'a T>,
mut limit: u32,
is_less: &mut F,
) where
F: FnMut(&T, &T) -> bool,
{
loop {
if v.len() <= T::small_sort_threshold() {
T::small_sort(v, is_less);
return;
}
// If too many bad pivot choices were made, simply fall back to heapsort in order to
// guarantee `O(N x log(N))` worst-case.
if limit == 0 {
// SAFETY: We assume the `small_sort` threshold is at least 1.
unsafe {
crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
}
return;
}
limit -= 1;
// Choose a pivot and try guessing whether the slice is already sorted.
let pivot_pos = choose_pivot(v, is_less);
// If the chosen pivot is equal to the predecessor, then it's the smallest element in the
// slice. Partition the slice into elements equal to and elements greater than the pivot.
// This case is usually hit when the slice contains many duplicate elements.
if let Some(p) = ancestor_pivot {
// SAFETY: We assume choose_pivot yields an in-bounds position.
if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));
// Continue sorting elements greater than the pivot. We know that `num_lt` contains
// the pivot. So we can continue after `num_lt`.
v = &mut v[(num_lt + 1)..];
ancestor_pivot = None;
continue;
}
}
// Partition the slice.
let num_lt = partition(v, pivot_pos, is_less);
// SAFETY: partition ensures that `num_lt` will be in-bounds.
unsafe { intrinsics::assume(num_lt < v.len()) };
// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(num_lt);
let (pivot, right) = right.split_at_mut(1);
let pivot = &pivot[0];
// Recurse into the left side. We have a fixed recursion limit, testing shows no real
// benefit for recursing into the shorter side.
quicksort(left, ancestor_pivot, limit, is_less);
// Continue with the right side.
v = right;
ancestor_pivot = Some(pivot);
}
}
/// Takes the input slice `v` and re-arranges elements such that when the call returns normally
/// all elements that compare true for `is_less(elem, pivot)` where `pivot == v[pivot_pos]` are
/// on the left side of `v` followed by the other elements, notionally considered greater or
/// equal to `pivot`.
///
/// Returns the number of elements that are compared true for `is_less(elem, pivot)`.
///
/// If `is_less` does not implement a total order the resulting order and return value are
/// unspecified. All original elements will remain in `v` and any possible modifications via
/// interior mutability will be observable. Same is true if `is_less` panics.
pub(crate) fn partition<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
// Allows for panic-free code-gen by proving this property to the compiler.
if len == 0 {
return 0;
}
// Allows for panic-free code-gen by proving this property to the compiler.
if pivot >= len {
intrinsics::abort();
}
// Place the pivot at the beginning of slice.
v.swap(0, pivot);
let (pivot, v_without_pivot) = v.split_at_mut(1);
// Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
// signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
// Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
// into a stack value, but this creates issues for types with interior mutability mandating
// a drop guard.
let pivot = &mut pivot[0];
// This construct is used to limit the LLVM IR generated, which saves large amounts of
// compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);
// Place the pivot between the two partitions.
v.swap(0, num_lt);
num_lt
}
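// Illustrative sketch (hypothetical, not part of the original sources): the returned
// count is the number of elements that compare less than the chosen pivot, and the
// pivot itself ends up at that index.
#[test]
fn partition_example() {
    let mut v = [3_i32, 7, 1, 9, 5];
    // Use v[4] == 5 as the pivot.
    let num_lt = partition(&mut v, 4, &mut |a, b| a < b);
    assert_eq!(num_lt, 2);
    assert!(v[..num_lt].iter().all(|x| *x < 5));
    assert_eq!(v[num_lt], 5);
    assert!(v[num_lt + 1..].iter().all(|x| *x >= 5));
}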
const fn inst_partition<T, F: FnMut(&T, &T) -> bool>() -> fn(&mut [T], &T, &mut F) -> usize {
const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96;
if mem::size_of::<T>() <= MAX_BRANCHLESS_PARTITION_SIZE {
// Specialize for types that are relatively cheap to copy, where branchless optimizations
// have large leverage e.g. `u64` and `String`.
partition_lomuto_branchless_cyclic::<T, F>
} else {
partition_hoare_branchy_cyclic::<T, F>
}
}
/// See [`partition`].
fn partition_hoare_branchy_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();
if len == 0 {
return 0;
}
// Optimized for large types that are expensive to move, not for integers. It is optimized for
// small code-gen, assuming that is_less is an expensive operation that generates a substantial
// amount of code or a call, and that copying elements will likely be a call to memcpy. Using
// two `ptr::copy_nonoverlapping` calls has a chance to be faster than `ptr::swap_nonoverlapping`
// because `memcpy` can use wide SIMD based on runtime feature detection. Benchmarks support
// this analysis.
let mut gap_opt: Option<GapGuard<T>> = None;
// SAFETY: The left-to-right scanning loop performs a bounds check, where we know that `left >=
// v_base && left < right && right <= v_base.add(len)`. The right-to-left scanning loop performs
// a bounds check ensuring that `right` is in-bounds. We checked that `len` is more than zero,
// which means that unconditional `right = right.sub(1)` is safe to do. The exit check makes
// sure that `left` and `right` never alias, making `ptr::copy_nonoverlapping` safe. The
// drop-guard `gap` ensures that should `is_less` panic we always overwrite the duplicate in the
// input. `gap.pos` stores the previous value of `right` and starts at `right` and so it too is
// in-bounds. We never pass the saved `gap.value` to `is_less` while it is inside the `GapGuard`
// thus any changes via interior mutability will be observed.
unsafe {
let v_base = v.as_mut_ptr();
let mut left = v_base;
let mut right = v_base.add(len);
loop {
// Find the first element that is greater than or equal to the pivot.
while left < right && is_less(&*left, pivot) {
left = left.add(1);
}
// Find the last element that is less than the pivot.
loop {
right = right.sub(1);
if left >= right || is_less(&*right, pivot) {
break;
}
}
if left >= right {
break;
}
// Swap the found pair of out-of-order elements via cyclic permutation.
let is_first_swap_pair = gap_opt.is_none();
if is_first_swap_pair {
gap_opt = Some(GapGuard { pos: right, value: ManuallyDrop::new(ptr::read(left)) });
}
let gap = gap_opt.as_mut().unwrap_unchecked();
// Single place where we instantiate ptr::copy_nonoverlapping in the partition.
if !is_first_swap_pair {
ptr::copy_nonoverlapping(left, gap.pos, 1);
}
gap.pos = right;
ptr::copy_nonoverlapping(right, left, 1);
left = left.add(1);
}
left.sub_ptr(v_base)
// `gap_opt` goes out of scope here; its drop impl fills the hole at the last wrong-side element
// on the right with the saved value of the first wrong-side element on the left, which was
// overwritten when the cyclic permutation started.
}
}
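
// Minimal sketch (hypothetical helper, not part of this change) of the hole-based swap that the
// cyclic permutation above generalizes: save one element, then use two plain
// `ptr::copy_nonoverlapping` calls instead of `ptr::swap_nonoverlapping`, with a `GapGuard`
// filling the hole back in even if code running between the two copies were to unwind.
#[allow(dead_code)]
fn swap_via_hole_sketch<T>(v: &mut [T], a: usize, b: usize) {
    assert!(a != b && a < v.len() && b < v.len());
    // SAFETY: `a` and `b` are distinct in-bounds indices, so the two copies never overlap and
    // the guard writes to a valid element of `v` when it drops.
    unsafe {
        let base = v.as_mut_ptr();
        // `v[a]` now lives only in the guard; its old slot holds a bitwise duplicate.
        let mut hole =
            GapGuard { pos: base.add(a), value: ManuallyDrop::new(ptr::read(base.add(a))) };
        ptr::copy_nonoverlapping(base.add(b), base.add(a), 1);
        hole.pos = base.add(b);
        // `hole` drops here and writes the saved value into `v[b]`, completing the swap.
    }
}
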
struct PartitionState<T> {
// The current element that is being looked at, scans left to right through slice.
right: *mut T,
// Counts the number of elements that compared less-than, also works around:
// https://github.com/rust-lang/rust/issues/117128
num_lt: usize,
// Gap guard that tracks the temporary duplicate in the input.
gap: GapGuardRaw<T>,
}

fn partition_lomuto_branchless_cyclic<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
where
F: FnMut(&T, &T) -> bool,
{
// Novel partition implementation by Lukas Bergdoll and Orson Peters. Branchless Lomuto
// partition paired with a cyclic permutation.
// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md
let len = v.len();
let v_base = v.as_mut_ptr();
if len == 0 {
return 0;
}
// SAFETY: We checked that `len` is more than zero, which means that reading `v_base` is safe to
// do. From there we have a bounded loop where `v_base.add(i)` is guaranteed in-bounds. `v` and
// `pivot` can't alias because of type system rules. The drop-guard `gap` ensures that should
// `is_less` panic we always overwrite the duplicate in the input. `gap.pos` stores the previous
// value of `right` and starts at `v_base` and so it too is in-bounds. Given `UNROLL_LEN == 2`
// after the main loop we either have A) the last element in `v` that has not yet been processed
// because `len % 2 != 0`, or B) all elements have been processed except the gap value that was
// saved at the beginning with `ptr::read(v_base)`. In the case A) the loop will iterate twice,
// first performing loop_body to take care of the last element that didn't fit into the unroll.
// After that the behavior is the same as for B) where we use the saved value as `right` to
// overwrite the duplicate. If this very last call to `is_less` panics the saved value will be
// copied back including all possible changes via interior mutability. If `is_less` does not
// panic and the code continues we overwrite the duplicate and do `right = right.add(1)`, this
// is safe to do with `&mut *gap.value` because `T` is the same as `[T; 1]` and generating a
// pointer one past the allocation is safe.
unsafe {
let mut loop_body = |state: &mut PartitionState<T>| {
let right_is_lt = is_less(&*state.right, pivot);
let left = v_base.add(state.num_lt);
ptr::copy(left, state.gap.pos, 1);
ptr::copy_nonoverlapping(state.right, left, 1);
state.gap.pos = state.right;
state.num_lt += right_is_lt as usize;
state.right = state.right.add(1);
};
// Ideally we could just use GapGuard in PartitionState, but the reference that is
// materialized with `&mut state` when calling `loop_body` would create a mutable reference
// to the parent struct that contains the gap value, invalidating the reference pointer
// created from a reference to the gap value in the cleanup loop. This is only an issue
// under Stacked Borrows, Tree Borrows accepts the intuitive code using GapGuard as valid.
let mut gap_value = ManuallyDrop::new(ptr::read(v_base));
let mut state = PartitionState {
num_lt: 0,
right: v_base.add(1),
gap: GapGuardRaw { pos: v_base, value: &mut *gap_value },
};
// Manual unrolling that works well on x86, Arm and with opt-level=s without murdering
// compile-times. Leaving this to the compiler yields ok to bad results.
let unroll_len = const { if mem::size_of::<T>() <= 16 { 2 } else { 1 } };
let unroll_end = v_base.add(len - (unroll_len - 1));
while state.right < unroll_end {
if unroll_len == 2 {
loop_body(&mut state);
loop_body(&mut state);
} else {
loop_body(&mut state);
}
}
// Use the same `loop_body` for both the unroll cleanup and the cyclic permutation cleanup, so
// it is only instantiated once. This optimizes binary-size and compile-time.
let end = v_base.add(len);
loop {
let is_done = state.right == end;
state.right = if is_done { state.gap.value } else { state.right };
loop_body(&mut state);
if is_done {
mem::forget(state.gap);
break;
}
}
state.num_lt
}
}
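
// Safe, index-based sketch (hypothetical helper, not part of this change) of the branchless
// Lomuto scheme that `partition_lomuto_branchless_cyclic` implements with raw pointers and a
// cyclic permutation: the swap runs unconditionally and `num_lt` only advances when the element
// compared less-than, so nothing in the loop body branches on the comparison result.
#[allow(dead_code)]
fn partition_lomuto_branchless_sketch<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    pivot: &T,
    is_less: &mut F,
) -> usize {
    let mut num_lt = 0;
    for right in 0..v.len() {
        let right_is_lt = is_less(&v[right], pivot);
        v.swap(num_lt, right);
        num_lt += right_is_lt as usize;
    }
    num_lt
}
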
struct GapGuard<T> {
pos: *mut T,
value: ManuallyDrop<T>,
}

impl<T> Drop for GapGuard<T> {
fn drop(&mut self) {
unsafe {
ptr::copy_nonoverlapping(&*self.value, self.pos, 1);
}
}
}

/// Ideally this wouldn't be needed and we could just use the regular GapGuard.
/// See comment in [`partition_lomuto_branchless_cyclic`].
struct GapGuardRaw<T> {
pos: *mut T,
value: *mut T,
}

impl<T> Drop for GapGuardRaw<T> {
fn drop(&mut self) {
unsafe {
ptr::copy_nonoverlapping(self.value, self.pos, 1);
}
}
}
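
// Generic sketch (hypothetical helper and closure, not part of this change) of the drop-guard
// idiom both guards above implement: bitwise-copy an element into a guard, run user code that
// may unwind, and either disarm the guard on success or let its `Drop` impl write the saved
// copy back during unwinding, mirroring how the partition loops keep the input intact if
// `is_less` panics.
#[allow(dead_code)]
fn with_element_taken_sketch<T>(v: &mut [T], user_code: &mut impl FnMut(&T)) {
    if v.is_empty() {
        return;
    }
    // SAFETY: `v` is non-empty, so reading the first element through a raw pointer is in-bounds
    // and the guard's destination stays valid until it is dropped or forgotten.
    unsafe {
        let first = v.as_mut_ptr();
        let guard = GapGuard { pos: first, value: ManuallyDrop::new(ptr::read(first)) };
        // Panic point: if `user_code` unwinds, `guard` writes the saved value back over the
        // bitwise duplicate still sitting in `v[0]`.
        user_code(&*first);
        // Success path: nothing was moved elsewhere, so disarm the guard instead of letting it
        // clobber possible interior-mutability changes made by `user_code`.
        mem::forget(guard);
    }
}
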

View File

@ -49,7 +49,6 @@
#![feature(is_sorted)]
#![feature(layout_for_ptr)]
#![feature(pattern)]
#![feature(sort_internals)]
#![feature(slice_take)]
#![feature(slice_from_ptr_range)]
#![feature(slice_split_once)]

View File

@ -1803,9 +1803,8 @@ fn brute_force_rotate_test_1() {
#[test]
#[cfg(not(target_arch = "wasm32"))]
fn sort_unstable() {
use core::cmp::Ordering::{Equal, Greater, Less};
use core::slice::heapsort;
use rand::{seq::SliceRandom, Rng};
// use core::cmp::Ordering::{Equal, Greater, Less};
use rand::Rng;
// Miri is too slow (but still need to `chain` to make the types match)
let lens = if cfg!(miri) { (2..20).chain(0..0) } else { (2..25).chain(500..510) };
@ -1839,31 +1838,10 @@ fn sort_unstable() {
tmp.copy_from_slice(v);
tmp.sort_unstable_by(|a, b| b.cmp(a));
assert!(tmp.windows(2).all(|w| w[0] >= w[1]));
// Test heapsort using `<` operator.
tmp.copy_from_slice(v);
heapsort(tmp, |a, b| a < b);
assert!(tmp.windows(2).all(|w| w[0] <= w[1]));
// Test heapsort using `>` operator.
tmp.copy_from_slice(v);
heapsort(tmp, |a, b| a > b);
assert!(tmp.windows(2).all(|w| w[0] >= w[1]));
}
}
}
// Sort using a completely random comparison function.
// This will reorder the elements *somehow*, but won't panic.
for i in 0..v.len() {
v[i] = i as i32;
}
v.sort_unstable_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap());
v.sort_unstable();
for i in 0..v.len() {
assert_eq!(v[i], i as i32);
}
// Should not panic.
[0i32; 0].sort_unstable();
[(); 10].sort_unstable();