Mirror of https://github.com/rust-lang/rust.git
Unify stable and unstable sort implementations in same core module
This moves the stable sort implementation into the core::slice::sort module. By virtue of being in core it can't access `Vec`. The two `Vec`s used by merge sort, `buf` and `runs`, are modelled as custom types that implement the very limited required `Vec` interface with the help of caller-provided allocation and free functions. This is done to allow future re-use of functions and logic between stable and unstable sort, such as `insert_head`.
This commit is contained in:
Parent: 736c675d2a
Commit: dbc0ed2a10
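
The crux of the design described above: code in `core` cannot name `Vec`, so the scratch buffer and the run stack become minimal hand-rolled containers whose memory comes from caller-supplied closures. A sketch of that shape (hypothetical names, not this commit's code):

    // MiniBuf stands in for the commit's BufGuard/RunVec idea: a Vec-like
    // guard that compiles without `alloc` because the caller hands in the
    // allocation and free functions.
    struct MiniBuf<T, D: Fn(*mut T, usize)> {
        ptr: *mut T,
        capacity: usize,
        dealloc_fn: D,
    }

    impl<T, D: Fn(*mut T, usize)> MiniBuf<T, D> {
        fn new(capacity: usize, alloc_fn: impl Fn(usize) -> *mut T, dealloc_fn: D) -> Self {
            Self { ptr: alloc_fn(capacity), capacity, dealloc_fn }
        }
    }

    impl<T, D: Fn(*mut T, usize)> Drop for MiniBuf<T, D> {
        fn drop(&mut self) {
            // Freeing is the caller's policy too. Elements are assumed Copy-like
            // (no per-element drops), which holds for the run metadata in the diff.
            (self.dealloc_fn)(self.ptr, self.capacity);
        }
    }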
--- a/library/alloc/src/slice.rs
+++ b/library/alloc/src/slice.rs
@@ -19,10 +19,12 @@ use core::cmp::Ordering::{self, Less};
 use core::mem::{self, SizedTypeProperties};
 #[cfg(not(no_global_oom_handling))]
 use core::ptr;
+#[cfg(not(no_global_oom_handling))]
+use core::slice::sort;
 
 use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
-use crate::alloc::Global;
+use crate::alloc::{self, Global};
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::ToOwned;
 use crate::boxed::Box;
@@ -203,7 +205,7 @@ impl<T> [T] {
     where
         T: Ord,
     {
-        merge_sort(self, T::lt);
+        stable_sort(self, T::lt);
     }
 
     /// Sorts the slice with a comparator function.
@@ -259,7 +261,7 @@ impl<T> [T] {
     where
         F: FnMut(&T, &T) -> Ordering,
    {
-        merge_sort(self, |a, b| compare(a, b) == Less);
+        stable_sort(self, |a, b| compare(a, b) == Less);
     }
 
     /// Sorts the slice with a key extraction function.
@@ -302,7 +304,7 @@ impl<T> [T] {
         F: FnMut(&T) -> K,
         K: Ord,
     {
-        merge_sort(self, |a, b| f(a).lt(&f(b)));
+        stable_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
     /// Sorts the slice with a key extraction function.
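
The three hunks above only swap the callee name; the observable behavior of the public stable-sort entry points is unchanged. For orientation, what they do (standard stable API, not new code in this commit):

    fn main() {
        let mut a = [5, 4, 1, 3, 2];
        a.sort(); // now delegates to stable_sort(self, T::lt)
        assert_eq!(a, [1, 2, 3, 4, 5]);

        let mut b = [5, 4, 1, 3, 2];
        b.sort_by(|x, y| y.cmp(x)); // comparator form
        assert_eq!(b, [5, 4, 3, 2, 1]);

        let mut c = ["hello", "a", "bcd"];
        c.sort_by_key(|s| s.len()); // key-extraction form
        assert_eq!(c, ["a", "bcd", "hello"]);
    }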
@@ -809,324 +811,52 @@ impl<T: Clone> ToOwned for [T] {
 // Sorting
 ////////////////////////////////////////////////////////////////////////////////
 
-/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
-///
-/// This is the integral subroutine of insertion sort.
-#[cfg(not(no_global_oom_handling))]
-fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    if v.len() >= 2 && is_less(&v[1], &v[0]) {
-        unsafe {
-            // There are three ways to implement insertion here:
-            //
-            // 1. Swap adjacent elements until the first one gets to its final destination.
-            //    However, this way we copy data around more than is necessary. If elements are big
-            //    structures (costly to copy), this method will be slow.
-            //
-            // 2. Iterate until the right place for the first element is found. Then shift the
-            //    elements succeeding it to make room for it and finally place it into the
-            //    remaining hole. This is a good method.
-            //
-            // 3. Copy the first element into a temporary variable. Iterate until the right place
-            //    for it is found. As we go along, copy every traversed element into the slot
-            //    preceding it. Finally, copy data from the temporary variable into the remaining
-            //    hole. This method is very good. Benchmarks demonstrated slightly better
-            //    performance than with the 2nd method.
-            //
-            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
-            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
-
-            // Intermediate state of the insertion process is always tracked by `hole`, which
-            // serves two purposes:
-            // 1. Protects integrity of `v` from panics in `is_less`.
-            // 2. Fills the remaining hole in `v` in the end.
-            //
-            // Panic safety:
-            //
-            // If `is_less` panics at any point during the process, `hole` will get dropped and
-            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
-            // initially held exactly once.
-            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
-            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
-
-            for i in 2..v.len() {
-                if !is_less(&v[i], &*tmp) {
-                    break;
-                }
-                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
-                hole.dest = &mut v[i];
-            }
-            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
-        }
-    }
-
-    // When dropped, copies from `src` into `dest`.
-    struct InsertionHole<T> {
-        src: *const T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for InsertionHole<T> {
-        fn drop(&mut self) {
-            unsafe {
-                ptr::copy_nonoverlapping(self.src, self.dest, 1);
-            }
-        }
-    }
-}
-
-/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
-/// stores the result into `v[..]`.
-///
-/// # Safety
-///
-/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
-/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-#[cfg(not(no_global_oom_handling))]
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    let len = v.len();
-    let v = v.as_mut_ptr();
-    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
-
-    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
-    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
-    // copying the lesser (or greater) one into `v`.
-    //
-    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
-    // consumed first, then we must copy whatever is left of the shorter run into the remaining
-    // hole in `v`.
-    //
-    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
-    // 1. Protects integrity of `v` from panics in `is_less`.
-    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
-    //
-    // Panic safety:
-    //
-    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
-    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
-    // object it initially held exactly once.
-    let mut hole;
-
-    if mid <= len - mid {
-        // The left run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v, buf, mid);
-            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
-        }
-
-        // Initially, these pointers point to the beginnings of their arrays.
-        let left = &mut hole.start;
-        let mut right = v_mid;
-        let out = &mut hole.dest;
-
-        while *left < hole.end && right < v_end {
-            // Consume the lesser side.
-            // If equal, prefer the left run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right, &**left) {
-                    get_and_increment(&mut right)
-                } else {
-                    get_and_increment(left)
-                };
-                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
-            }
-        }
-    } else {
-        // The right run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
-            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
-        }
-
-        // Initially, these pointers point past the ends of their arrays.
-        let left = &mut hole.dest;
-        let right = &mut hole.end;
-        let mut out = v_end;
-
-        while v < *left && buf < *right {
-            // Consume the greater side.
-            // If equal, prefer the right run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
-                    decrement_and_get(left)
-                } else {
-                    decrement_and_get(right)
-                };
-                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
-            }
-        }
-    }
-    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
-    // it will now be copied into the hole in `v`.
-
-    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
-        let old = *ptr;
-        *ptr = unsafe { ptr.add(1) };
-        old
-    }
-
-    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
-        *ptr = unsafe { ptr.sub(1) };
-        *ptr
-    }
-
-    // When dropped, copies the range `start..end` into `dest..`.
-    struct MergeHole<T> {
-        start: *mut T,
-        end: *mut T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for MergeHole<T> {
-        fn drop(&mut self) {
-            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
-            unsafe {
-                let len = self.end.sub_ptr(self.start);
-                ptr::copy_nonoverlapping(self.start, self.dest, len);
-            }
-        }
-    }
-}
-
-/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
-/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
-///
-/// The algorithm identifies strictly descending and non-descending subsequences, which are called
-/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
-/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
-/// satisfied:
-///
-/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
-/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
-///
-/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
-#[cfg(not(no_global_oom_handling))]
-fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    // Slices of up to this length get sorted using insertion sort.
-    const MAX_INSERTION: usize = 20;
-    // Very short runs are extended using insertion sort to span at least this many elements.
-    const MIN_RUN: usize = 10;
-
-    // Sorting has no meaningful behavior on zero-sized types.
-    if T::IS_ZST {
-        return;
-    }
-
-    let len = v.len();
-
-    // Short arrays get sorted in-place via insertion sort to avoid allocations.
-    if len <= MAX_INSERTION {
-        if len >= 2 {
-            for i in (0..len - 1).rev() {
-                insert_head(&mut v[i..], &mut is_less);
-            }
-        }
-        return;
-    }
-
-    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
-    // shallow copies of the contents of `v` without risking the dtors running on copies if
-    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
-    // which will always have length at most `len / 2`.
-    let mut buf = Vec::with_capacity(len / 2);
-
-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut runs = vec![];
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
-        }
-
-        // Insert some more elements into the run if it's too short. Insertion sort is faster than
-        // merge sort on short sequences, so this significantly improves performance.
-        while start > 0 && end - start < MIN_RUN {
-            start -= 1;
-            insert_head(&mut v[start..end], &mut is_less);
-        }
-
-        // Push this run onto the stack.
-        runs.push(Run { start, len: end - start });
-        end = start;
-
-        // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(&runs) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            unsafe {
-                merge(
-                    &mut v[left.start..right.start + right.len],
-                    left.len,
-                    buf.as_mut_ptr(),
-                    &mut is_less,
-                );
-            }
-            runs[r] = Run { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
-        }
-    }
-
-    // Finally, exactly one run must remain in the stack.
-    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
-
-    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
-    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
-    // algorithm should continue building a new run instead, `None` is returned.
-    //
-    // TimSort is infamous for its buggy implementations, as described here:
-    // http://envisage-project.eu/timsort-specification-and-verification/
-    //
-    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
-    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
-    // hold for *all* runs in the stack.
-    //
-    // This function correctly checks invariants for the top four runs. Additionally, if the top
-    // run starts at index 0, it will always demand a merge operation until the stack is fully
-    // collapsed, in order to complete the sort.
-    #[inline]
-    fn collapse(runs: &[Run]) -> Option<usize> {
-        let n = runs.len();
-        if n >= 2
-            && (runs[n - 1].start == 0
-                || runs[n - 2].len <= runs[n - 1].len
-                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
-                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
-        {
-            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
-        } else {
-            None
-        }
-    }
-
-    #[derive(Clone, Copy)]
-    struct Run {
-        start: usize,
-        len: usize,
-    }
-}
+#[inline]
+#[cfg(not(no_global_oom_handling))]
+fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    if T::IS_ZST {
+        // Sorting has no meaningful behavior on zero-sized types. Do nothing.
+        return;
+    }
+
+    let elem_alloc_fn = |len: usize| -> *mut T {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
+        // elements.
+        unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
+    };
+
+    let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
+        }
+    };
+
+    let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
+        // obscene length or 0.
+        unsafe {
+            alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
+                as *mut sort::TimSortRun
+        }
+    };
+
+    let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
+        // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(
+                buf_ptr as *mut u8,
+                alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
+            );
+        }
+    };
+
+    sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
+}
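
The four closures in `stable_sort` above are the entire allocation contract: the alloc side owns the memory, while the core-side `merge_sort` only borrows raw pointers. A standalone sketch of the same pattern with a hypothetical helper name (illustrative only; note this naive version leaks the buffer if `f` panics, which is exactly why the real code wraps the pointer in a Drop guard instead):

    use std::alloc::{alloc, dealloc, Layout};

    // Allocate `len` elements, lend out the raw pointer, free afterwards.
    // Assumes len > 0 so the layout is non-zero-sized.
    fn with_scratch<T, R>(len: usize, f: impl FnOnce(*mut T) -> R) -> R {
        let layout = Layout::array::<T>(len).unwrap();
        unsafe {
            let ptr = alloc(layout) as *mut T;
            assert!(!ptr.is_null(), "allocation failed");
            let result = f(ptr);
            dealloc(ptr as *mut u8, layout);
            result
        }
    }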
--- a/library/core/src/slice/mod.rs
+++ b/library/core/src/slice/mod.rs
@@ -28,13 +28,19 @@ use crate::slice;
 /// Pure rust memchr implementation, taken from rust-memchr
 pub mod memchr;
 
+#[unstable(
+    feature = "slice_internals",
+    issue = "none",
+    reason = "exposed from core to be reused in std;"
+)]
+pub mod sort;
+
 mod ascii;
 mod cmp;
 mod index;
 mod iter;
 mod raw;
 mod rotate;
-mod sort;
 mod specialize;
 
 #[stable(feature = "rust1", since = "1.0.0")]
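
With `pub mod sort` gated as above, the module is reachable from outside `core` only through the internal feature gate, e.g. (nightly-only, illustrative):

    // The gate exists so the alloc crate (std's side) can reuse the core
    // logic; it is not meant as a public API.
    #![feature(slice_internals)]
    use core::slice::sort;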
--- a/library/core/src/slice/sort.rs
+++ b/library/core/src/slice/sort.rs
@@ -5,6 +5,11 @@
 //!
 //! Unstable sorting is compatible with libcore because it doesn't allocate memory, unlike our
 //! stable sorting implementation.
+//!
+//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on
+//! TimSort.
+
+#![allow(unused)] // FIXME debug
 
 use crate::cmp;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
@@ -883,6 +888,7 @@ fn partition_at_index_loop<'a, T, F>(
     }
 }
 
+/// Reorder the slice such that the element at `index` is at its final sorted position.
 pub fn partition_at_index<T, F>(
     v: &mut [T],
     index: usize,
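
`partition_at_index` backs the stable std API `select_nth_unstable`; a quick usage reminder (standard API, not part of this diff):

    fn main() {
        let mut v = [5, 4, 1, 3, 2];
        // Afterwards, index 2 holds the value it would hold after a full sort.
        let (lesser, median, greater) = v.select_nth_unstable(2);
        assert_eq!(*median, 3);
        assert!(lesser.iter().all(|&x| x <= 3) && greater.iter().all(|&x| x >= 3));
    }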
@@ -927,3 +933,491 @@ where
     let pivot = &mut pivot[0];
     (left, pivot, right)
 }
+
+/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
+///
+/// This is the integral subroutine of insertion sort.
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    if v.len() >= 2 && is_less(&v[1], &v[0]) {
+        unsafe {
+            // There are three ways to implement insertion here:
+            //
+            // 1. Swap adjacent elements until the first one gets to its final destination.
+            //    However, this way we copy data around more than is necessary. If elements are big
+            //    structures (costly to copy), this method will be slow.
+            //
+            // 2. Iterate until the right place for the first element is found. Then shift the
+            //    elements succeeding it to make room for it and finally place it into the
+            //    remaining hole. This is a good method.
+            //
+            // 3. Copy the first element into a temporary variable. Iterate until the right place
+            //    for it is found. As we go along, copy every traversed element into the slot
+            //    preceding it. Finally, copy data from the temporary variable into the remaining
+            //    hole. This method is very good. Benchmarks demonstrated slightly better
+            //    performance than with the 2nd method.
+            //
+            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
+            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
+
+            // Intermediate state of the insertion process is always tracked by `hole`, which
+            // serves two purposes:
+            // 1. Protects integrity of `v` from panics in `is_less`.
+            // 2. Fills the remaining hole in `v` in the end.
+            //
+            // Panic safety:
+            //
+            // If `is_less` panics at any point during the process, `hole` will get dropped and
+            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
+            // initially held exactly once.
+            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
+            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
+
+            for i in 2..v.len() {
+                if !is_less(&v[i], &*tmp) {
+                    break;
+                }
+                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
+                hole.dest = &mut v[i];
+            }
+            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
+        }
+    }
+
+    // When dropped, copies from `src` into `dest`.
+    struct InsertionHole<T> {
+        src: *const T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for InsertionHole<T> {
+        fn drop(&mut self) {
+            unsafe {
+                ptr::copy_nonoverlapping(self.src, self.dest, 1);
+            }
+        }
+    }
+}
+
+/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
+/// stores the result into `v[..]`.
+///
+/// # Safety
+///
+/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
+/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+    let v = v.as_mut_ptr();
+    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
+
+    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
+    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
+    // copying the lesser (or greater) one into `v`.
+    //
+    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
+    // consumed first, then we must copy whatever is left of the shorter run into the remaining
+    // hole in `v`.
+    //
+    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
+    // 1. Protects integrity of `v` from panics in `is_less`.
+    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
+    //
+    // Panic safety:
+    //
+    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
+    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
+    // object it initially held exactly once.
+    let mut hole;
+
+    if mid <= len - mid {
+        // The left run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v, buf, mid);
+            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
+        }
+
+        // Initially, these pointers point to the beginnings of their arrays.
+        let left = &mut hole.start;
+        let mut right = v_mid;
+        let out = &mut hole.dest;
+
+        while *left < hole.end && right < v_end {
+            // Consume the lesser side.
+            // If equal, prefer the left run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right, &**left) {
+                    get_and_increment(&mut right)
+                } else {
+                    get_and_increment(left)
+                };
+                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
+            }
+        }
+    } else {
+        // The right run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
+            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
+        }
+
+        // Initially, these pointers point past the ends of their arrays.
+        let left = &mut hole.dest;
+        let right = &mut hole.end;
+        let mut out = v_end;
+
+        while v < *left && buf < *right {
+            // Consume the greater side.
+            // If equal, prefer the right run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
+                    decrement_and_get(left)
+                } else {
+                    decrement_and_get(right)
+                };
+                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
+            }
+        }
+    }
+    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
+    // it will now be copied into the hole in `v`.
+
+    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
+        let old = *ptr;
+        *ptr = unsafe { ptr.add(1) };
+        old
+    }
+
+    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
+        *ptr = unsafe { ptr.sub(1) };
+        *ptr
+    }
+
+    // When dropped, copies the range `start..end` into `dest..`.
+    struct MergeHole<T> {
+        start: *mut T,
+        end: *mut T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for MergeHole<T> {
+        fn drop(&mut self) {
+            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
+            unsafe {
+                let len = self.end.sub_ptr(self.start);
+                ptr::copy_nonoverlapping(self.start, self.dest, len);
+            }
+        }
+    }
+}
+
+/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in
+/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However Python
+/// has switched to a Powersort based implementation.
+///
+/// The algorithm identifies strictly descending and non-descending subsequences, which are called
+/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
+/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
+/// satisfied:
+///
+/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
+/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
+///
+/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
+    v: &mut [T],
+    is_less: &mut CmpF,
+    elem_alloc_fn: ElemAllocF,
+    elem_dealloc_fn: ElemDeallocF,
+    run_alloc_fn: RunAllocF,
+    run_dealloc_fn: RunDeallocF,
+) where
+    CmpF: FnMut(&T, &T) -> bool,
+    ElemAllocF: Fn(usize) -> *mut T,
+    ElemDeallocF: Fn(*mut T, usize),
+    RunAllocF: Fn(usize) -> *mut TimSortRun,
+    RunDeallocF: Fn(*mut TimSortRun, usize),
+{
+    // Slices of up to this length get sorted using insertion sort.
+    const MAX_INSERTION: usize = 20;
+    // Very short runs are extended using insertion sort to span at least this many elements.
+    const MIN_RUN: usize = 10;
+
+    // The caller should have already checked that.
+    debug_assert!(!T::IS_ZST);
+
+    let len = v.len();
+
+    // Short arrays get sorted in-place via insertion sort to avoid allocations.
+    if len <= MAX_INSERTION {
+        if len >= 2 {
+            for i in (0..len - 1).rev() {
+                insert_head(&mut v[i..], is_less);
+            }
+        }
+        return;
+    }
+
+    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
+    // shallow copies of the contents of `v` without risking the dtors running on copies if
+    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+    // which will always have length at most `len / 2`.
+    let mut buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn);
+    let buf_ptr = buf.buf_ptr;
+
+    let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);
+
+    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
+    // strange decision, but consider the fact that merges more often go in the opposite direction
+    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
+    // backwards. To conclude, identifying runs by traversing backwards improves performance.
+    let mut end = len;
+    while end > 0 {
+        // Find the next natural run, and reverse it if it's strictly descending.
+        let mut start = end - 1;
+        if start > 0 {
+            start -= 1;
+            unsafe {
+                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
+                    v[start..end].reverse();
+                } else {
+                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
+                    {
+                        start -= 1;
+                    }
+                }
+            }
+        }
+
+        // Insert some more elements into the run if it's too short. Insertion sort is faster than
+        // merge sort on short sequences, so this significantly improves performance.
+        while start > 0 && end - start < MIN_RUN {
+            start -= 1;
+            insert_head(&mut v[start..end], is_less);
+        }
+
+        // Push this run onto the stack.
+        runs.push(TimSortRun { start, len: end - start });
+        end = start;
+
+        // Merge some pairs of adjacent runs to satisfy the invariants.
+        while let Some(r) = collapse(runs.as_slice()) {
+            let left = runs[r + 1];
+            let right = runs[r];
+            unsafe {
+                merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+            }
+            runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
+            runs.remove(r + 1);
+        }
+    }
+
+    // Finally, exactly one run must remain in the stack.
+    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+
+    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
+    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
+    // algorithm should continue building a new run instead, `None` is returned.
+    //
+    // TimSort is infamous for its buggy implementations, as described here:
+    // http://envisage-project.eu/timsort-specification-and-verification/
+    //
+    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
+    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
+    // hold for *all* runs in the stack.
+    //
+    // This function correctly checks invariants for the top four runs. Additionally, if the top
+    // run starts at index 0, it will always demand a merge operation until the stack is fully
+    // collapsed, in order to complete the sort.
+    #[inline]
+    fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+        let n = runs.len();
+        if n >= 2
+            && (runs[n - 1].start == 0
+                || runs[n - 2].len <= runs[n - 1].len
+                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
+                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
+        {
+            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
+        } else {
+            None
+        }
+    }
+
+    // Extremely basic versions of Vec.
+    // Their use is super limited and by having the code here, it allows reuse between the sort
+    // implementations.
+    struct BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        buf_ptr: *mut T,
+        capacity: usize,
+        elem_alloc_fn: ElemAllocF,
+        elem_dealloc_fn: ElemDeallocF,
+    }
+
+    impl<T, ElemAllocF, ElemDeallocF> BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        fn new(len: usize, elem_alloc_fn: ElemAllocF, elem_dealloc_fn: ElemDeallocF) -> Self {
+            Self { buf_ptr: elem_alloc_fn(len), capacity: len, elem_alloc_fn, elem_dealloc_fn }
+        }
+    }
+
+    impl<T, ElemAllocF, ElemDeallocF> Drop for BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        fn drop(&mut self) {
+            (self.elem_dealloc_fn)(self.buf_ptr, self.capacity);
+        }
+    }
+
+    struct RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        buf_ptr: *mut TimSortRun,
+        capacity: usize,
+        len: usize,
+        run_alloc_fn: RunAllocF,
+        run_dealloc_fn: RunDeallocF,
+    }
+
+    impl<RunAllocF, RunDeallocF> RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self {
+            // Most slices can be sorted with at most 16 runs in-flight.
+            const START_RUN_CAPACITY: usize = 16;
+
+            Self {
+                buf_ptr: run_alloc_fn(START_RUN_CAPACITY),
+                capacity: START_RUN_CAPACITY,
+                len: 0,
+                run_alloc_fn,
+                run_dealloc_fn,
+            }
+        }
+
+        fn push(&mut self, val: TimSortRun) {
+            if self.len == self.capacity {
+                let old_capacity = self.capacity;
+                let old_buf_ptr = self.buf_ptr;
+
+                self.capacity = self.capacity * 2;
+                self.buf_ptr = (self.run_alloc_fn)(self.capacity);
+
+                // SAFETY: buf_ptr new and old were correctly allocated and old_buf_ptr has
+                // old_capacity valid elements.
+                unsafe {
+                    ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr, old_capacity);
+                }
+
+                (self.run_dealloc_fn)(old_buf_ptr, old_capacity);
+            }
+
+            // SAFETY: The invariant was just checked.
+            unsafe {
+                self.buf_ptr.add(self.len).write(val);
+            }
+            self.len += 1;
+        }
+
+        fn remove(&mut self, index: usize) {
+            if index >= self.len {
+                panic!("Index out of bounds");
+            }
+
+            // SAFETY: buf_ptr needs to be valid and len invariant upheld.
+            unsafe {
+                // the place we are taking from.
+                let ptr = self.buf_ptr.add(index);
+
+                // Shift everything down to fill in that spot.
+                ptr::copy(ptr.add(1), ptr, self.len - index - 1);
+            }
+            self.len -= 1;
+        }
+
+        fn as_slice(&self) -> &[TimSortRun] {
+            // SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld.
+            unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr, self.len) }
+        }
+
+        fn len(&self) -> usize {
+            self.len
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> core::ops::Index<usize> for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        type Output = TimSortRun;
+
+        fn index(&self, index: usize) -> &Self::Output {
+            if index < self.len {
+                // SAFETY: buf_ptr and len invariant must be upheld.
+                unsafe {
+                    return &*(self.buf_ptr.add(index));
+                }
+            }
+
+            panic!("Index out of bounds");
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> core::ops::IndexMut<usize> for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+            if index < self.len {
+                // SAFETY: buf_ptr and len invariant must be upheld.
+                unsafe {
+                    return &mut *(self.buf_ptr.add(index));
+                }
+            }
+
+            panic!("Index out of bounds");
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> Drop for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn drop(&mut self) {
+            // As long as TimSortRun is Copy we don't need to drop them individually but just the
+            // whole allocation.
+            (self.run_dealloc_fn)(self.buf_ptr, self.capacity);
+        }
+    }
+}
+
+/// Internal type used by merge_sort.
+#[derive(Clone, Copy, Debug)]
+pub struct TimSortRun {
+    len: usize,
+    start: usize,
+}
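
To make the two run invariants concrete, here is the `collapse` decision rule reproduced outside the library on a local `Run` type (worked example only; same logic as the function above):

    #[derive(Clone, Copy)]
    struct Run {
        start: usize,
        len: usize,
    }

    fn collapse(runs: &[Run]) -> Option<usize> {
        let n = runs.len();
        if n >= 2
            && (runs[n - 1].start == 0
                || runs[n - 2].len <= runs[n - 1].len
                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
        {
            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
        } else {
            None
        }
    }

    fn main() {
        // Lengths [30, 20, 10] violate invariant 2 at the top (30 <= 20 + 10),
        // so the two topmost runs (lengths 20 and 10) must merge: Some(1).
        // Runs are found back-to-front, so runs[0] covers the end of the slice.
        let stack = [
            Run { start: 35, len: 30 },
            Run { start: 15, len: 20 },
            Run { start: 5, len: 10 }, // top of stack; start != 0, sort unfinished
        ];
        assert_eq!(collapse(&stack), Some(1));

        // Lengths [40, 20, 10] satisfy both invariants: no merge is demanded.
        let stack = [
            Run { start: 35, len: 40 },
            Run { start: 15, len: 20 },
            Run { start: 5, len: 10 },
        ];
        assert_eq!(collapse(&stack), None);
    }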