From c93d71e06485b93144b52db6af861c29adebb029 Mon Sep 17 00:00:00 2001 From: marc0246 <40955683+marc0246@users.noreply.github.com> Date: Thu, 7 Sep 2023 09:39:47 +0200 Subject: [PATCH] Make the suballocators `!Sync` (#2317) --- vulkano/src/memory/allocator/mod.rs | 96 ++------ vulkano/src/memory/allocator/suballocator.rs | 223 +++++-------------- 2 files changed, 82 insertions(+), 237 deletions(-) diff --git a/vulkano/src/memory/allocator/mod.rs b/vulkano/src/memory/allocator/mod.rs index 65ced9ea..ae09ed2b 100644 --- a/vulkano/src/memory/allocator/mod.rs +++ b/vulkano/src/memory/allocator/mod.rs @@ -239,7 +239,7 @@ use crate::{ VulkanError, }; use ash::vk::{MAX_MEMORY_HEAPS, MAX_MEMORY_TYPES}; -use parking_lot::RwLock; +use parking_lot::Mutex; use std::{ error::Error, fmt::{Debug, Display, Error as FmtError, Formatter}, @@ -878,7 +878,7 @@ pub struct GenericMemoryAllocator { #[derive(Debug)] struct Pool { - blocks: RwLock>>>, + blocks: Mutex>>>, // This is cached here for faster access, so we don't need to hop through 3 pointers. memory_type: ash::vk::MemoryType, atom_size: DeviceAlignment, @@ -888,7 +888,7 @@ impl GenericMemoryAllocator { // This is a false-positive, we only use this const for static initialization. #[allow(clippy::declare_interior_mutable_const)] const EMPTY_POOL: Pool = Pool { - blocks: RwLock::new(Vec::new()), + blocks: Mutex::new(Vec::new()), memory_type: ash::vk::MemoryType { property_flags: ash::vk::MemoryPropertyFlags::empty(), heap_index: 0, @@ -1068,7 +1068,7 @@ impl GenericMemoryAllocator { } } -unsafe impl MemoryAllocator for GenericMemoryAllocator { +unsafe impl MemoryAllocator for GenericMemoryAllocator { fn find_memory_type_index( &self, memory_type_bits: u32, @@ -1145,64 +1145,19 @@ unsafe impl MemoryAllocator for Generic layout = layout.align_to(pool.atom_size).unwrap(); - let mut blocks = if S::IS_BLOCKING { - // If the allocation algorithm needs to block, then there's no point in trying to avoid - // locks here either. In that case the best strategy is to take full advantage of it by - // always taking an exclusive lock, which lets us sort the blocks by free size. If you - // as a user want to avoid locks, simply don't share the allocator between threads. You - // can create as many allocators as you wish, but keep in mind that that will waste a - // huge amount of memory unless you configure your block sizes properly! + let mut blocks = pool.blocks.lock(); - let mut blocks = pool.blocks.write(); - blocks.sort_by_key(|block| block.free_size()); - let (Ok(idx) | Err(idx)) = - blocks.binary_search_by_key(&size, |block| block.free_size()); + // TODO: Incremental sorting + blocks.sort_by_key(|block| block.free_size()); + let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size()); - for block in &blocks[idx..] { - if let Ok(allocation) = - block.allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } + for block in &blocks[idx..] { + if let Ok(allocation) = + block.allocate(layout, allocation_type, self.buffer_image_granularity) + { + return Ok(allocation); } - - blocks - } else { - // If the allocation algorithm is lock-free, then we should avoid taking an exclusive - // lock unless it is absolutely neccessary (meaning, only when allocating a new - // `DeviceMemory` block and inserting it into a pool). This has the disadvantage that - // traversing the pool is O(n), which is not a problem since the number of blocks is - // expected to be small. 
If there are more than 10 blocks in a pool then that's a - // configuration error. Also, sorting the blocks before each allocation would be less - // efficient because to get the free size of the `PoolAllocator` and `BumpAllocator` - // has the same performance as trying to allocate. - - let blocks = pool.blocks.read(); - - // Search in reverse order because we always append new blocks at the end. - for block in blocks.iter().rev() { - if let Ok(allocation) = - block.allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } - } - - let len = blocks.len(); - drop(blocks); - let blocks = pool.blocks.write(); - - if blocks.len() > len { - // Another thread beat us to it and inserted a fresh block, try to suballocate it. - if let Ok(allocation) = - blocks[len].allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } - } - - blocks - }; + } // For bump allocators, first do a garbage sweep and try to allocate again. if S::NEEDS_CLEANUP { @@ -1484,33 +1439,30 @@ unsafe impl MemoryAllocator for Generic unsafe fn deallocate(&self, allocation: MemoryAlloc) { if let Some(suballocation) = allocation.suballocation { + let memory_type_index = allocation.device_memory.memory_type_index(); + let pool = self.pools[memory_type_index as usize].blocks.lock(); let block_ptr = allocation.allocation_handle.0 as *const Block; // TODO: Maybe do a similar check for dedicated blocks. - #[cfg(debug_assertions)] - { - let memory_type_index = allocation.device_memory.memory_type_index(); - let pool = self.pools[memory_type_index as usize].blocks.read(); - - assert!( - pool.iter() - .any(|block| &**block as *const Block == block_ptr), - "attempted to deallocate a memory block that does not belong to this allocator", - ); - } + debug_assert!( + pool.iter() + .any(|block| &**block as *const Block == block_ptr), + "attempted to deallocate a memory block that does not belong to this allocator", + ); // SAFETY: The caller must guarantee that `allocation` refers to one allocated by // `self`, therefore `block_ptr` must be the same one we gave out on allocation. We // know that this pointer must be valid, because all blocks are boxed and pinned in // memory and because a block isn't dropped until the allocator itself is dropped, at // which point it would be impossible to call this method. We also know that it must be - // valid to create a reference to the block, because we only ever access it via shared - // references. + // valid to create a reference to the block, because we locked the pool it belongs to. let block = &*block_ptr; // SAFETY: The caller must guarantee that `allocation` refers to a currently allocated // allocation of `self`. block.deallocate(suballocation); + + drop(pool); } } } diff --git a/vulkano/src/memory/allocator/suballocator.rs b/vulkano/src/memory/allocator/suballocator.rs index bb922ba4..4400a3a5 100644 --- a/vulkano/src/memory/allocator/suballocator.rs +++ b/vulkano/src/memory/allocator/suballocator.rs @@ -18,14 +18,12 @@ use super::{ align_down, align_up, array_vec::ArrayVec, AllocationHandle, DeviceAlignment, DeviceLayout, }; use crate::{image::ImageTiling, memory::is_aligned, DeviceSize, NonZeroDeviceSize}; -use parking_lot::Mutex; use std::{ - cell::Cell, + cell::{Cell, UnsafeCell}, cmp, error::Error, fmt::{self, Display}, ptr, - sync::atomic::{AtomicU64, Ordering}, }; /// Suballocators are used to divide a *region* into smaller *suballocations*. 
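The `mod.rs` hunks above replace the old two-path locking scheme in `GenericMemoryAllocator::allocate` with a single strategy: always take the pool's `Mutex`, sort the blocks by free size, binary-search for the first block that could fit the request, and try the remaining candidates in order. The following sketch illustrates that selection strategy with made-up `Block` and `Pool` types (bare `u64` sizes instead of `DeviceLayout`, no allocation types or buffer-image granularity); it is an illustration of the pattern, not vulkano's actual API.

use parking_lot::Mutex;
use std::cell::Cell;

// Simplified stand-in for vulkano's `Block<S>`: a bump-style block with interior
// mutability. `used` is only touched while the pool's `Mutex` is held.
struct Block {
    size: u64,
    used: Cell<u64>,
}

impl Block {
    fn free_size(&self) -> u64 {
        self.size - self.used.get()
    }

    // Returns the offset of the suballocation on success; `Err(())` stands in
    // for `SuballocatorError`.
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        if size <= self.free_size() {
            let offset = self.used.get();
            self.used.set(offset + size);
            Ok(offset)
        } else {
            Err(())
        }
    }
}

struct Pool {
    // Boxing keeps every block at a stable address, which the real allocator
    // relies on when `deallocate` turns an `AllocationHandle` back into a
    // `*const Block`.
    blocks: Mutex<Vec<Box<Block>>>,
}

impl Pool {
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        let mut blocks = self.blocks.lock();

        // Sorting by free size lets the binary search skip blocks that are
        // definitely too small. A candidate can still fail (fragmentation,
        // alignment), so the remaining blocks are tried in order.
        blocks.sort_by_key(|block| block.free_size());
        let (Ok(idx) | Err(idx)) =
            blocks.binary_search_by_key(&size, |block| block.free_size());

        for block in &blocks[idx..] {
            if let Ok(offset) = block.allocate(size) {
                return Ok(offset);
            }
        }

        // The real allocator would allocate a fresh `DeviceMemory` block here
        // (and, for bump allocators, do a garbage sweep first); the sketch just
        // reports failure.
        Err(())
    }
}

fn main() {
    let pool = Pool {
        blocks: Mutex::new(vec![
            Box::new(Block { size: 1024, used: Cell::new(0) }),
            Box::new(Block { size: 4096, used: Cell::new(0) }),
        ]),
    };

    // The 2048-byte request skips the 1024-byte block entirely thanks to the
    // sort + binary search and lands in the larger block.
    assert_eq!(pool.allocate(2048), Ok(0));
    assert_eq!(pool.allocate(2048), Ok(2048));
    assert!(pool.allocate(2048).is_err());
}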
@@ -69,14 +67,6 @@ use std::{ /// [`DeviceMemory`]: crate::memory::DeviceMemory /// [pages]: super#pages pub unsafe trait Suballocator { - /// Whether this allocator needs to block or not. - /// - /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the - /// suballocator at compile time. - /// - /// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator - const IS_BLOCKING: bool; - /// Whether the allocator needs [`cleanup`] to be called before memory can be released. /// /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the @@ -280,13 +270,11 @@ impl Display for SuballocatorError { pub struct FreeListAllocator { region_offset: DeviceSize, // Total memory remaining in the region. - free_size: AtomicU64, - state: Mutex, + free_size: Cell, + state: UnsafeCell, } unsafe impl Suballocator for FreeListAllocator { - const IS_BLOCKING: bool = true; - const NEEDS_CLEANUP: bool = false; /// Creates a new `FreeListAllocator` for the given [region]. @@ -296,7 +284,7 @@ unsafe impl Suballocator for FreeListAllocator { // NOTE(Marc): This number was pulled straight out of my a- const AVERAGE_ALLOCATION_SIZE: DeviceSize = 64 * 1024; - let free_size = AtomicU64::new(region_size); + let free_size = Cell::new(region_size); let capacity = (region_size / AVERAGE_ALLOCATION_SIZE) as usize; let mut nodes = host::PoolAllocator::new(capacity + 64); @@ -309,7 +297,7 @@ unsafe impl Suballocator for FreeListAllocator { ty: SuballocationType::Free, }); free_list.push(root_id); - let state = Mutex::new(FreeListAllocatorState { nodes, free_list }); + let state = UnsafeCell::new(FreeListAllocatorState { nodes, free_list }); FreeListAllocator { region_offset, @@ -337,7 +325,7 @@ unsafe impl Suballocator for FreeListAllocator { let size = layout.size(); let alignment = layout.alignment(); - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; unsafe { match state.free_list.last() { @@ -392,7 +380,7 @@ unsafe impl Suballocator for FreeListAllocator { // This can't overflow because suballocation sizes in the free-list are // constrained by the remaining size of the region. - self.free_size.fetch_sub(size, Ordering::Release); + self.free_size.set(self.free_size.get() - size); return Ok(Suballocation { offset, @@ -421,14 +409,14 @@ unsafe impl Suballocator for FreeListAllocator { // allocation of `self`. let node_id = SlotId::new(suballocation.handle.0 as _); - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; let node = state.nodes.get_mut(node_id); debug_assert!(node.ty != SuballocationType::Free); // Suballocation sizes are constrained by the size of the region, so they can't possibly // overflow when added up. - self.free_size.fetch_add(node.size, Ordering::Release); + self.free_size.set(self.free_size.get() + node.size); node.ty = SuballocationType::Free; state.coalesce(node_id); @@ -437,7 +425,7 @@ unsafe impl Suballocator for FreeListAllocator { #[inline] fn free_size(&self) -> DeviceSize { - self.free_size.load(Ordering::Acquire) + self.free_size.get() } #[inline] @@ -748,8 +736,8 @@ impl FreeListAllocatorState { pub struct BuddyAllocator { region_offset: DeviceSize, // Total memory remaining in the region. 
- free_size: AtomicU64, - state: Mutex, + free_size: Cell, + state: UnsafeCell, } impl BuddyAllocator { @@ -761,8 +749,6 @@ impl BuddyAllocator { } unsafe impl Suballocator for BuddyAllocator { - const IS_BLOCKING: bool = true; - const NEEDS_CLEANUP: bool = false; /// Creates a new `BuddyAllocator` for the given [region]. @@ -783,13 +769,13 @@ unsafe impl Suballocator for BuddyAllocator { assert!(max_order < BuddyAllocator::MAX_ORDERS); - let free_size = AtomicU64::new(region_size); + let free_size = Cell::new(region_size); let mut free_list = ArrayVec::new(max_order + 1, [EMPTY_FREE_LIST; BuddyAllocator::MAX_ORDERS]); // The root node has the lowest offset and highest order, so it's the whole region. free_list[max_order].push(region_offset); - let state = Mutex::new(BuddyAllocatorState { free_list }); + let state = UnsafeCell::new(BuddyAllocatorState { free_list }); BuddyAllocator { region_offset, @@ -840,7 +826,7 @@ unsafe impl Suballocator for BuddyAllocator { let size = cmp::max(size, BuddyAllocator::MIN_NODE_SIZE).next_power_of_two(); let min_order = (size / BuddyAllocator::MIN_NODE_SIZE).trailing_zeros() as usize; - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; // Start searching at the lowest possible order going up. for (order, free_list) in state.free_list.iter_mut().enumerate().skip(min_order) { @@ -875,7 +861,7 @@ unsafe impl Suballocator for BuddyAllocator { // This can't overflow because suballocation sizes in the free-list are // constrained by the remaining size of the region. - self.free_size.fetch_sub(size, Ordering::Release); + self.free_size.set(self.free_size.get() - size); return Ok(Suballocation { offset, @@ -900,7 +886,7 @@ unsafe impl Suballocator for BuddyAllocator { let order = suballocation.handle.0 as usize; let min_order = order; - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; debug_assert!(!state.free_list[order].contains(&offset)); @@ -930,7 +916,7 @@ unsafe impl Suballocator for BuddyAllocator { // The sizes of suballocations allocated by `self` are constrained by that of // its region, so they can't possibly overflow when added up. - self.free_size.fetch_add(size, Ordering::Release); + self.free_size.set(self.free_size.get() + size); break; } @@ -945,7 +931,7 @@ unsafe impl Suballocator for BuddyAllocator { /// [internal fragmentation]: super#internal-fragmentation #[inline] fn free_size(&self) -> DeviceSize { - self.free_size.load(Ordering::Acquire) + self.free_size.get() } #[inline] @@ -1014,9 +1000,8 @@ struct BuddyAllocatorState { pub struct BumpAllocator { region_offset: DeviceSize, region_size: DeviceSize, - // Encodes the previous allocation type in the 2 least signifficant bits and the free start in - // the rest. - state: AtomicU64, + free_start: Cell, + prev_allocation_type: Cell, } impl BumpAllocator { @@ -1025,29 +1010,23 @@ impl BumpAllocator { /// [region]: Suballocator#regions #[inline] pub fn reset(&mut self) { - *self.state.get_mut() = AllocationType::Unknown as DeviceSize; + *self.free_start.get_mut() = 0; + *self.prev_allocation_type.get_mut() = AllocationType::Unknown; } } unsafe impl Suballocator for BumpAllocator { - const IS_BLOCKING: bool = false; - const NEEDS_CLEANUP: bool = true; /// Creates a new `BumpAllocator` for the given [region]. 
/// /// [region]: Suballocator#regions fn new(region_offset: DeviceSize, region_size: DeviceSize) -> Self { - // Sanity check: this would lead to UB because of the left-shifting by 2 needed to encode - // the free-start into the state. - assert!(region_size <= (DeviceLayout::MAX_SIZE >> 2)); - - let state = AtomicU64::new(AllocationType::Unknown as DeviceSize); - BumpAllocator { region_offset, region_size, - state, + free_start: Cell::new(0), + prev_allocation_type: Cell::new(AllocationType::Unknown), } } @@ -1058,97 +1037,42 @@ unsafe impl Suballocator for BumpAllocator { allocation_type: AllocationType, buffer_image_granularity: DeviceAlignment, ) -> Result { - const SPIN_LIMIT: u32 = 6; - - // NOTE(Marc): The following code is a minimal version `Backoff` taken from - // crossbeam_utils v0.8.11, because we didn't want to add a dependency for a couple lines - // that are used in one place only. - /// Original documentation: - /// https://docs.rs/crossbeam-utils/0.8.11/crossbeam_utils/struct.Backoff.html - struct Backoff { - step: Cell, - } - - impl Backoff { - fn new() -> Self { - Backoff { step: Cell::new(0) } - } - - fn spin(&self) { - for _ in 0..1 << self.step.get().min(SPIN_LIMIT) { - core::hint::spin_loop(); - } - - if self.step.get() <= SPIN_LIMIT { - self.step.set(self.step.get() + 1); - } - } - } - fn has_granularity_conflict(prev_ty: AllocationType, ty: AllocationType) -> bool { prev_ty == AllocationType::Unknown || prev_ty != ty } let size = layout.size(); let alignment = layout.alignment(); - let backoff = Backoff::new(); - let mut state = self.state.load(Ordering::Relaxed); - loop { - let free_start = state >> 2; + // These can't overflow because offsets are constrained by the size of the root + // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`. + let prev_end = self.region_offset + self.free_start.get(); + let mut offset = align_up(prev_end, alignment); - // These can't overflow because offsets are constrained by the size of the root - // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`. - let prev_end = self.region_offset + free_start; - let mut offset = align_up(prev_end, alignment); - - if buffer_image_granularity != DeviceAlignment::MIN { - let prev_alloc_type = match state & 0b11 { - 0 => AllocationType::Unknown, - 1 => AllocationType::Linear, - 2 => AllocationType::NonLinear, - _ => unreachable!(), - }; - - if prev_end > 0 - && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity) - && has_granularity_conflict(prev_alloc_type, allocation_type) - { - offset = align_up(offset, buffer_image_granularity); - } - } - - let relative_offset = offset - self.region_offset; - - let free_start = relative_offset + size; - - if free_start > self.region_size { - return Err(SuballocatorError::OutOfRegionMemory); - } - - // This can't discard any bits because we checked that `region_size` does not exceed - // `DeviceLayout::MAX_SIZE >> 2`. 
- let new_state = free_start << 2 | allocation_type as DeviceSize; - - match self.state.compare_exchange_weak( - state, - new_state, - Ordering::Release, - Ordering::Relaxed, - ) { - Ok(_) => { - return Ok(Suballocation { - offset, - size, - handle: AllocationHandle(ptr::null_mut()), - }); - } - Err(new_state) => { - state = new_state; - backoff.spin(); - } - } + if buffer_image_granularity != DeviceAlignment::MIN + && prev_end > 0 + && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity) + && has_granularity_conflict(self.prev_allocation_type.get(), allocation_type) + { + offset = align_up(offset, buffer_image_granularity); } + + let relative_offset = offset - self.region_offset; + + let free_start = relative_offset + size; + + if free_start > self.region_size { + return Err(SuballocatorError::OutOfRegionMemory); + } + + self.free_start.set(free_start); + self.prev_allocation_type.set(allocation_type); + + Ok(Suballocation { + offset, + size, + handle: AllocationHandle(ptr::null_mut()), + }) } #[inline] @@ -1158,7 +1082,7 @@ unsafe impl Suballocator for BumpAllocator { #[inline] fn free_size(&self) -> DeviceSize { - self.region_size - (self.state.load(Ordering::Acquire) >> 2) + self.region_size - self.free_start.get() } #[inline] @@ -1303,6 +1227,7 @@ mod host { mod tests { use super::*; use crossbeam_queue::ArrayQueue; + use parking_lot::Mutex; use std::thread; const fn unwrap(opt: Option) -> T { @@ -1322,7 +1247,7 @@ mod tests { const REGION_SIZE: DeviceSize = (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD; - let allocator = FreeListAllocator::new(0, REGION_SIZE); + let allocator = Mutex::new(FreeListAllocator::new(0, REGION_SIZE)); let allocs = ArrayQueue::new((ALLOCATIONS_PER_THREAD * THREADS) as usize); // Using threads to randomize allocation order. @@ -1337,6 +1262,7 @@ mod tests { allocs .push( allocator + .lock() .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN) .unwrap(), ) @@ -1346,6 +1272,8 @@ mod tests { } }); + let allocator = allocator.into_inner(); + assert!(allocator .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN) .is_err()); @@ -1709,39 +1637,4 @@ mod tests { allocator.reset(); assert!(allocator.free_size() == REGION_SIZE); } - - #[test] - fn bump_allocator_syncness() { - const THREADS: DeviceSize = 12; - const ALLOCATIONS_PER_THREAD: DeviceSize = 100_000; - const ALLOCATION_STEP: DeviceSize = 117; - const REGION_SIZE: DeviceSize = - (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD; - - let mut allocator = BumpAllocator::new(0, REGION_SIZE); - - thread::scope(|scope| { - for i in 1..=THREADS { - let allocator = &allocator; - - scope.spawn(move || { - let layout = DeviceLayout::from_size_alignment(i * ALLOCATION_STEP, 1).unwrap(); - - for _ in 0..ALLOCATIONS_PER_THREAD { - allocator - .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN) - .unwrap(); - } - }); - } - }); - - assert!(allocator - .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN) - .is_err()); - assert!(allocator.free_size() == 0); - - allocator.reset(); - assert!(allocator.free_size() == REGION_SIZE); - } }
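On the suballocator side, the patch swaps the internal `AtomicU64` counters and `parking_lot::Mutex` state for `Cell` and `UnsafeCell`. Both of those are `!Sync`, so the suballocators automatically become `!Sync` as well, and any caller that still wants to share one across threads has to add its own lock, which is what the updated `FreeListAllocator` test does by wrapping the allocator in a `Mutex`. The sketch below uses a made-up `TinyBumpAllocator` (bare `u64` sizes, no `AllocationType` or buffer-image-granularity handling) to show the simplified bump path without the old compare-exchange loop, plus what the `Mutex`-wrapping looks like from the caller's side; it is an illustration of the pattern, not vulkano's real types.

use parking_lot::Mutex;
use std::cell::Cell;
use std::thread;

// Hypothetical, minimal bump allocator in the style the patch moves to: plain
// `Cell` state instead of an encoded `AtomicU64` plus compare-exchange loop.
struct TinyBumpAllocator {
    region_size: u64,
    free_start: Cell<u64>, // `Cell` is `!Sync`, so the whole struct is `!Sync`
}

impl TinyBumpAllocator {
    fn new(region_size: u64) -> Self {
        TinyBumpAllocator {
            region_size,
            free_start: Cell::new(0),
        }
    }

    // Returns the offset of the suballocation, or `Err(())` (standing in for
    // `SuballocatorError::OutOfRegionMemory`) if the region is exhausted.
    fn allocate(&self, size: u64, alignment: u64) -> Result<u64, ()> {
        debug_assert!(alignment.is_power_of_two());

        // Align the cursor, check the bound, bump the cursor. No CAS loop is
        // needed: `&self` cannot be used from two threads at once because the
        // type is `!Sync`.
        let offset = (self.free_start.get() + alignment - 1) & !(alignment - 1);
        let free_start = offset.checked_add(size).ok_or(())?;

        if free_start > self.region_size {
            return Err(());
        }

        self.free_start.set(free_start);

        Ok(offset)
    }

    fn free_size(&self) -> u64 {
        self.region_size - self.free_start.get()
    }
}

fn main() {
    // Sharing the allocator across threads now requires an explicit lock, just
    // like the updated FreeListAllocator test wraps its allocator in a `Mutex`.
    let allocator = Mutex::new(TinyBumpAllocator::new(1 << 20));

    thread::scope(|scope| {
        for _ in 0..4 {
            scope.spawn(|| {
                for _ in 0..100 {
                    allocator.lock().allocate(256, 64).unwrap();
                }
            });
        }
    });

    // 4 threads * 100 allocations * 256 bytes, with no alignment padding since
    // 256 is already a multiple of the 64-byte alignment.
    assert_eq!(allocator.into_inner().free_size(), (1 << 20) - 4 * 100 * 256);
}

The external lock is opt-in: a suballocator that stays on one thread pays no synchronization cost at all, which is the point of making these types `!Sync`.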