Make the suballocators !Sync (#2317)

marc0246 2023-09-07 09:39:47 +02:00 committed by GitHub
parent 64ea44c25e
commit c93d71e064
2 changed files with 82 additions and 237 deletions
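For context, a minimal sketch (not taken from the diff; the `Demo` type and `assert_send` helper are illustrative only) of why replacing atomics and locks with `Cell`/`UnsafeCell` fields makes a type `!Sync` while leaving it `Send`:

use std::cell::Cell;

// A stand-in for a suballocator that tracks its state with `Cell` instead of atomics.
struct Demo {
    free_size: Cell<u64>,
}

// `Cell<u64>` is `Send` but not `Sync`, and auto traits propagate to containing
// structs, so `Demo: Send` holds while `Demo: Sync` does not.
fn assert_send<T: Send>() {}

fn main() {
    assert_send::<Demo>(); // compiles: the allocator can still be moved to another thread
    // A corresponding `fn assert_sync<T: Sync>() {}` call would fail to compile,
    // which is the `!Sync` property the commit title refers to.
    let demo = Demo { free_size: Cell::new(1024) };
    demo.free_size.set(demo.free_size.get() - 64); // interior mutability through `&self`
    assert_eq!(demo.free_size.get(), 960);
}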

Changed file 1 of 2

@ -239,7 +239,7 @@ use crate::{
VulkanError,
};
use ash::vk::{MAX_MEMORY_HEAPS, MAX_MEMORY_TYPES};
-use parking_lot::RwLock;
+use parking_lot::Mutex;
use std::{
error::Error,
fmt::{Debug, Display, Error as FmtError, Formatter},
@ -878,7 +878,7 @@ pub struct GenericMemoryAllocator<S> {
#[derive(Debug)]
struct Pool<S> {
-blocks: RwLock<Vec<Box<Block<S>>>>,
+blocks: Mutex<Vec<Box<Block<S>>>>,
// This is cached here for faster access, so we don't need to hop through 3 pointers.
memory_type: ash::vk::MemoryType,
atom_size: DeviceAlignment,
@ -888,7 +888,7 @@ impl<S> GenericMemoryAllocator<S> {
// This is a false-positive, we only use this const for static initialization.
#[allow(clippy::declare_interior_mutable_const)]
const EMPTY_POOL: Pool<S> = Pool {
-blocks: RwLock::new(Vec::new()),
+blocks: Mutex::new(Vec::new()),
memory_type: ash::vk::MemoryType {
property_flags: ash::vk::MemoryPropertyFlags::empty(),
heap_index: 0,
@ -1068,7 +1068,7 @@ impl<S> GenericMemoryAllocator<S> {
}
}
-unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
+unsafe impl<S: Suballocator + Send + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
fn find_memory_type_index(
&self,
memory_type_bits: u32,
@ -1145,64 +1145,19 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for Generic
layout = layout.align_to(pool.atom_size).unwrap();
-let mut blocks = if S::IS_BLOCKING {
-// If the allocation algorithm needs to block, then there's no point in trying to avoid
-// locks here either. In that case the best strategy is to take full advantage of it by
-// always taking an exclusive lock, which lets us sort the blocks by free size. If you
-// as a user want to avoid locks, simply don't share the allocator between threads. You
-// can create as many allocators as you wish, but keep in mind that this will waste a
-// huge amount of memory unless you configure your block sizes properly!
+let mut blocks = pool.blocks.lock();
-let mut blocks = pool.blocks.write();
-blocks.sort_by_key(|block| block.free_size());
-let (Ok(idx) | Err(idx)) =
-blocks.binary_search_by_key(&size, |block| block.free_size());
+// TODO: Incremental sorting
+blocks.sort_by_key(|block| block.free_size());
+let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size());
-for block in &blocks[idx..] {
-if let Ok(allocation) =
-block.allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
+for block in &blocks[idx..] {
+if let Ok(allocation) =
+block.allocate(layout, allocation_type, self.buffer_image_granularity)
+{
+return Ok(allocation);
+}
-blocks
-} else {
-// If the allocation algorithm is lock-free, then we should avoid taking an exclusive
-// lock unless it is absolutely necessary (meaning, only when allocating a new
-// `DeviceMemory` block and inserting it into a pool). This has the disadvantage that
-// traversing the pool is O(n), which is not a problem since the number of blocks is
-// expected to be small. If there are more than 10 blocks in a pool then that's a
-// configuration error. Also, sorting the blocks before each allocation would be less
-// efficient because getting the free size of the `PoolAllocator` and `BumpAllocator`
-// costs the same as trying to allocate.
-let blocks = pool.blocks.read();
-// Search in reverse order because we always append new blocks at the end.
-for block in blocks.iter().rev() {
-if let Ok(allocation) =
-block.allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
-}
-let len = blocks.len();
-drop(blocks);
-let blocks = pool.blocks.write();
-if blocks.len() > len {
-// Another thread beat us to it and inserted a fresh block, try to suballocate it.
-if let Ok(allocation) =
-blocks[len].allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
-}
-blocks
-};
+}
// For bump allocators, first do a garbage sweep and try to allocate again.
if S::NEEDS_CLEANUP {
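The rewritten path above always takes the `Mutex`, sorts the blocks by free size, and binary-searches for the first block that might fit. A simplified sketch of that best-fit search, using a toy `Block` type in place of vulkano's (the names here are illustrative, not the crate's API):

// Toy stand-in for a memory block that knows how much free space it has.
struct Block {
    free: u64,
}

impl Block {
    fn free_size(&self) -> u64 {
        self.free
    }

    // Pretend suballocation: succeeds only if the block has enough free space left.
    fn allocate(&mut self, size: u64) -> Result<u64, ()> {
        if self.free >= size {
            self.free -= size;
            Ok(0) // a real block would compute a proper offset
        } else {
            Err(())
        }
    }
}

// Mirrors the new strategy: sort ascending by free size, binary-search the first
// candidate that could fit, then try candidates from there upward.
fn allocate_from_pool(blocks: &mut Vec<Block>, size: u64) -> Option<u64> {
    blocks.sort_by_key(|block| block.free_size());
    let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size());

    for block in &mut blocks[idx..] {
        if let Ok(offset) = block.allocate(size) {
            return Some(offset);
        }
    }

    None // the real allocator would create a fresh `DeviceMemory` block here
}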
@ -1484,33 +1439,30 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for Generic
unsafe fn deallocate(&self, allocation: MemoryAlloc) {
if let Some(suballocation) = allocation.suballocation {
+let memory_type_index = allocation.device_memory.memory_type_index();
+let pool = self.pools[memory_type_index as usize].blocks.lock();
let block_ptr = allocation.allocation_handle.0 as *const Block<S>;
// TODO: Maybe do a similar check for dedicated blocks.
-#[cfg(debug_assertions)]
-{
-let memory_type_index = allocation.device_memory.memory_type_index();
-let pool = self.pools[memory_type_index as usize].blocks.read();
-assert!(
-pool.iter()
-.any(|block| &**block as *const Block<S> == block_ptr),
-"attempted to deallocate a memory block that does not belong to this allocator",
-);
-}
+debug_assert!(
+pool.iter()
+.any(|block| &**block as *const Block<S> == block_ptr),
+"attempted to deallocate a memory block that does not belong to this allocator",
+);
// SAFETY: The caller must guarantee that `allocation` refers to one allocated by
// `self`, therefore `block_ptr` must be the same one we gave out on allocation. We
// know that this pointer must be valid, because all blocks are boxed and pinned in
// memory and because a block isn't dropped until the allocator itself is dropped, at
// which point it would be impossible to call this method. We also know that it must be
-// valid to create a reference to the block, because we only ever access it via shared
-// references.
+// valid to create a reference to the block, because we locked the pool it belongs to.
let block = &*block_ptr;
// SAFETY: The caller must guarantee that `allocation` refers to a currently allocated
// allocation of `self`.
block.deallocate(suballocation);
+drop(pool);
}
}
}
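The `debug_assert!` above relies on pointer identity: every block is boxed, so its heap address is stable for the allocator's lifetime and can serve as the allocation handle. A small illustration of that idea (toy types, not the crate's):

struct Block {
    id: u32,
}

fn main() {
    let pool: Vec<Box<Block>> = vec![Box::new(Block { id: 0 }), Box::new(Block { id: 1 })];

    // Hand out a raw pointer to a block, much like the allocator stores one in an
    // `AllocationHandle` when it creates an allocation.
    let handle: *const Block = &*pool[1];

    // On deallocation, membership can be checked by comparing addresses alone.
    let belongs = pool.iter().any(|block| &**block as *const Block == handle);
    println!("block {} belongs to this pool: {}", pool[1].id, belongs);
    assert!(belongs);
}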

Changed file 2 of 2

@ -18,14 +18,12 @@ use super::{
align_down, align_up, array_vec::ArrayVec, AllocationHandle, DeviceAlignment, DeviceLayout,
};
use crate::{image::ImageTiling, memory::is_aligned, DeviceSize, NonZeroDeviceSize};
-use parking_lot::Mutex;
use std::{
-cell::Cell,
+cell::{Cell, UnsafeCell},
cmp,
error::Error,
fmt::{self, Display},
ptr,
-sync::atomic::{AtomicU64, Ordering},
};
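The `UnsafeCell` import added here supports the pattern used throughout the new code below: `unsafe { &mut *self.state.get() }` hands out a mutable reference through `&self`, which is sound only because the allocator is now `!Sync` (no other thread can hold `&self` concurrently) and the method never re-enters itself. A self-contained sketch of that pattern with toy types (not vulkano's):

use std::cell::UnsafeCell;

struct State {
    free_list: Vec<u64>,
}

struct Alloc {
    // `UnsafeCell` makes `Alloc: !Sync`, so `&Alloc` can never be shared across threads.
    state: UnsafeCell<State>,
}

impl Alloc {
    fn new() -> Self {
        Alloc {
            state: UnsafeCell::new(State { free_list: vec![0] }),
        }
    }

    fn pop(&self) -> Option<u64> {
        // SAFETY (sketch): only the current thread can hold `&self`, and this method
        // does not call back into itself, so no other reference to `State` is live.
        let state = unsafe { &mut *self.state.get() };
        state.free_list.pop()
    }
}

fn main() {
    let alloc = Alloc::new();
    assert_eq!(alloc.pop(), Some(0));
    assert_eq!(alloc.pop(), None);
}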
/// Suballocators are used to divide a *region* into smaller *suballocations*.
@ -69,14 +67,6 @@ use std::{
/// [`DeviceMemory`]: crate::memory::DeviceMemory
/// [pages]: super#pages
pub unsafe trait Suballocator {
-/// Whether this allocator needs to block or not.
-///
-/// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
-/// suballocator at compile time.
-///
-/// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator
-const IS_BLOCKING: bool;
/// Whether the allocator needs [`cleanup`] to be called before memory can be released.
///
/// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
@ -280,13 +270,11 @@ impl Display for SuballocatorError {
pub struct FreeListAllocator {
region_offset: DeviceSize,
// Total memory remaining in the region.
-free_size: AtomicU64,
-state: Mutex<FreeListAllocatorState>,
+free_size: Cell<DeviceSize>,
+state: UnsafeCell<FreeListAllocatorState>,
}
unsafe impl Suballocator for FreeListAllocator {
-const IS_BLOCKING: bool = true;
const NEEDS_CLEANUP: bool = false;
/// Creates a new `FreeListAllocator` for the given [region].
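The removed `IS_BLOCKING` constant (see the trait docs deleted above and the per-allocator definitions deleted below) let `GenericMemoryAllocator` pick an allocation strategy at compile time. A rough sketch of that associated-const pattern, with toy types that are not part of vulkano:

// The generic code branches on an associated `bool`; after monomorphization the
// unused arm is compiled out, so the check is free at run time.
trait Suballoc {
    const IS_BLOCKING: bool;
}

struct Blocking;
impl Suballoc for Blocking {
    const IS_BLOCKING: bool = true;
}

struct LockFree;
impl Suballoc for LockFree {
    const IS_BLOCKING: bool = false;
}

fn strategy<S: Suballoc>() -> &'static str {
    if S::IS_BLOCKING {
        "take the write lock up front and sort the blocks"
    } else {
        "take the read lock and scan, upgrading only to insert a new block"
    }
}

fn main() {
    println!("{}", strategy::<Blocking>());
    println!("{}", strategy::<LockFree>());
}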
@ -296,7 +284,7 @@ unsafe impl Suballocator for FreeListAllocator {
// NOTE(Marc): This number was pulled straight out of my a-
const AVERAGE_ALLOCATION_SIZE: DeviceSize = 64 * 1024;
-let free_size = AtomicU64::new(region_size);
+let free_size = Cell::new(region_size);
let capacity = (region_size / AVERAGE_ALLOCATION_SIZE) as usize;
let mut nodes = host::PoolAllocator::new(capacity + 64);
@ -309,7 +297,7 @@ unsafe impl Suballocator for FreeListAllocator {
ty: SuballocationType::Free,
});
free_list.push(root_id);
-let state = Mutex::new(FreeListAllocatorState { nodes, free_list });
+let state = UnsafeCell::new(FreeListAllocatorState { nodes, free_list });
FreeListAllocator {
region_offset,
@ -337,7 +325,7 @@ unsafe impl Suballocator for FreeListAllocator {
let size = layout.size();
let alignment = layout.alignment();
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
unsafe {
match state.free_list.last() {
@ -392,7 +380,7 @@ unsafe impl Suballocator for FreeListAllocator {
// This can't overflow because suballocation sizes in the free-list are
// constrained by the remaining size of the region.
-self.free_size.fetch_sub(size, Ordering::Release);
+self.free_size.set(self.free_size.get() - size);
return Ok(Suballocation {
offset,
@ -421,14 +409,14 @@ unsafe impl Suballocator for FreeListAllocator {
// allocation of `self`.
let node_id = SlotId::new(suballocation.handle.0 as _);
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
let node = state.nodes.get_mut(node_id);
debug_assert!(node.ty != SuballocationType::Free);
// Suballocation sizes are constrained by the size of the region, so they can't possibly
// overflow when added up.
-self.free_size.fetch_add(node.size, Ordering::Release);
+self.free_size.set(self.free_size.get() + node.size);
node.ty = SuballocationType::Free;
state.coalesce(node_id);
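`coalesce` merges the freed node with free neighbors so the free list does not splinter into many small pieces. Vulkano does this over a node pool with neighbor links; the following is only a conceptual sketch over a sorted list of ranges:

#[derive(Debug, PartialEq)]
struct FreeRange {
    offset: u64,
    size: u64,
}

// Merge ranges whose end touches the next range's start.
fn coalesce(mut free: Vec<FreeRange>) -> Vec<FreeRange> {
    free.sort_by_key(|range| range.offset);
    let mut merged: Vec<FreeRange> = Vec::new();

    for range in free {
        if let Some(prev) = merged.last_mut() {
            if prev.offset + prev.size == range.offset {
                // The previous free range ends exactly where this one starts: merge.
                prev.size += range.size;
                continue;
            }
        }
        merged.push(range);
    }

    merged
}

fn main() {
    let merged = coalesce(vec![
        FreeRange { offset: 0, size: 64 },
        FreeRange { offset: 64, size: 32 },   // adjacent to the first: merges into it
        FreeRange { offset: 256, size: 128 }, // gap before this one: stays separate
    ]);
    assert_eq!(merged, vec![
        FreeRange { offset: 0, size: 96 },
        FreeRange { offset: 256, size: 128 },
    ]);
}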
@ -437,7 +425,7 @@ unsafe impl Suballocator for FreeListAllocator {
#[inline]
fn free_size(&self) -> DeviceSize {
-self.free_size.load(Ordering::Acquire)
+self.free_size.get()
}
#[inline]
@ -748,8 +736,8 @@ impl FreeListAllocatorState {
pub struct BuddyAllocator {
region_offset: DeviceSize,
// Total memory remaining in the region.
-free_size: AtomicU64,
-state: Mutex<BuddyAllocatorState>,
+free_size: Cell<DeviceSize>,
+state: UnsafeCell<BuddyAllocatorState>,
}
impl BuddyAllocator {
@ -761,8 +749,6 @@ impl BuddyAllocator {
}
unsafe impl Suballocator for BuddyAllocator {
-const IS_BLOCKING: bool = true;
const NEEDS_CLEANUP: bool = false;
/// Creates a new `BuddyAllocator` for the given [region].
@ -783,13 +769,13 @@ unsafe impl Suballocator for BuddyAllocator {
assert!(max_order < BuddyAllocator::MAX_ORDERS);
-let free_size = AtomicU64::new(region_size);
+let free_size = Cell::new(region_size);
let mut free_list =
ArrayVec::new(max_order + 1, [EMPTY_FREE_LIST; BuddyAllocator::MAX_ORDERS]);
// The root node has the lowest offset and highest order, so it's the whole region.
free_list[max_order].push(region_offset);
-let state = Mutex::new(BuddyAllocatorState { free_list });
+let state = UnsafeCell::new(BuddyAllocatorState { free_list });
BuddyAllocator {
region_offset,
@ -840,7 +826,7 @@ unsafe impl Suballocator for BuddyAllocator {
let size = cmp::max(size, BuddyAllocator::MIN_NODE_SIZE).next_power_of_two();
let min_order = (size / BuddyAllocator::MIN_NODE_SIZE).trailing_zeros() as usize;
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
// Start searching at the lowest possible order going up.
for (order, free_list) in state.free_list.iter_mut().enumerate().skip(min_order) {
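A worked example of the two lines above that compute `min_order`, assuming `MIN_NODE_SIZE` is 16 (the real constant lives on `BuddyAllocator`; 16 is used here only for illustration):

const MIN_NODE_SIZE: u64 = 16;

fn min_order(size: u64) -> usize {
    // Round the request up to a power of two no smaller than the minimum node size...
    let size = std::cmp::max(size, MIN_NODE_SIZE).next_power_of_two();
    // ...then the order is how many doublings of MIN_NODE_SIZE that corresponds to.
    (size / MIN_NODE_SIZE).trailing_zeros() as usize
}

fn main() {
    assert_eq!(min_order(1), 0);   // rounds up to 16  -> one minimum-size node
    assert_eq!(min_order(17), 1);  // rounds up to 32  -> 32 / 16 = 2 = 2^1
    assert_eq!(min_order(200), 4); // rounds up to 256 -> 256 / 16 = 16 = 2^4
}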
@ -875,7 +861,7 @@ unsafe impl Suballocator for BuddyAllocator {
// This can't overflow because suballocation sizes in the free-list are
// constrained by the remaining size of the region.
-self.free_size.fetch_sub(size, Ordering::Release);
+self.free_size.set(self.free_size.get() - size);
return Ok(Suballocation {
offset,
@ -900,7 +886,7 @@ unsafe impl Suballocator for BuddyAllocator {
let order = suballocation.handle.0 as usize;
let min_order = order;
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
debug_assert!(!state.free_list[order].contains(&offset));
@ -930,7 +916,7 @@ unsafe impl Suballocator for BuddyAllocator {
// The sizes of suballocations allocated by `self` are constrained by that of
// its region, so they can't possibly overflow when added up.
-self.free_size.fetch_add(size, Ordering::Release);
+self.free_size.set(self.free_size.get() + size);
break;
}
@ -945,7 +931,7 @@ unsafe impl Suballocator for BuddyAllocator {
/// [internal fragmentation]: super#internal-fragmentation
#[inline]
fn free_size(&self) -> DeviceSize {
-self.free_size.load(Ordering::Acquire)
+self.free_size.get()
}
#[inline]
@ -1014,9 +1000,8 @@ struct BuddyAllocatorState {
pub struct BumpAllocator {
region_offset: DeviceSize,
region_size: DeviceSize,
-// Encodes the previous allocation type in the 2 least significant bits and the free start in
-// the rest.
-state: AtomicU64,
+free_start: Cell<DeviceSize>,
+prev_allocation_type: Cell<AllocationType>,
}
impl BumpAllocator {
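The removed `state: AtomicU64` packed two values into one word, as the deleted comment describes: the previous allocation type in the two low bits and the free-start offset shifted left by two. A sketch of that encoding (the discriminant values 0/1/2 match the `match state & 0b11` arms removed further down; everything else is illustrative):

#[derive(Clone, Copy, Debug, PartialEq)]
enum AllocationType {
    Unknown = 0,
    Linear = 1,
    NonLinear = 2,
}

fn encode(free_start: u64, ty: AllocationType) -> u64 {
    // Requires `free_start <= u64::MAX >> 2`, which the old constructor guaranteed by
    // asserting `region_size <= DeviceLayout::MAX_SIZE >> 2`.
    (free_start << 2) | ty as u64
}

fn decode(state: u64) -> (u64, AllocationType) {
    let ty = match state & 0b11 {
        0 => AllocationType::Unknown,
        1 => AllocationType::Linear,
        2 => AllocationType::NonLinear,
        _ => unreachable!(),
    };
    (state >> 2, ty)
}

fn main() {
    let state = encode(4096, AllocationType::Linear);
    assert_eq!(decode(state), (4096, AllocationType::Linear));
}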
@ -1025,29 +1010,23 @@ impl BumpAllocator {
/// [region]: Suballocator#regions
#[inline]
pub fn reset(&mut self) {
-*self.state.get_mut() = AllocationType::Unknown as DeviceSize;
+*self.free_start.get_mut() = 0;
+*self.prev_allocation_type.get_mut() = AllocationType::Unknown;
}
}
unsafe impl Suballocator for BumpAllocator {
-const IS_BLOCKING: bool = false;
const NEEDS_CLEANUP: bool = true;
/// Creates a new `BumpAllocator` for the given [region].
///
/// [region]: Suballocator#regions
fn new(region_offset: DeviceSize, region_size: DeviceSize) -> Self {
-// Sanity check: this would lead to UB because of the left-shifting by 2 needed to encode
-// the free-start into the state.
-assert!(region_size <= (DeviceLayout::MAX_SIZE >> 2));
-let state = AtomicU64::new(AllocationType::Unknown as DeviceSize);
BumpAllocator {
region_offset,
region_size,
-state,
+free_start: Cell::new(0),
+prev_allocation_type: Cell::new(AllocationType::Unknown),
}
}
@ -1058,97 +1037,42 @@ unsafe impl Suballocator for BumpAllocator {
allocation_type: AllocationType,
buffer_image_granularity: DeviceAlignment,
) -> Result<Suballocation, SuballocatorError> {
-const SPIN_LIMIT: u32 = 6;
-// NOTE(Marc): The following code is a minimal version of `Backoff` taken from
-// crossbeam_utils v0.8.11, because we didn't want to add a dependency for a couple lines
-// that are used in one place only.
-/// Original documentation:
-/// https://docs.rs/crossbeam-utils/0.8.11/crossbeam_utils/struct.Backoff.html
-struct Backoff {
-step: Cell<u32>,
-}
-impl Backoff {
-fn new() -> Self {
-Backoff { step: Cell::new(0) }
-}
-fn spin(&self) {
-for _ in 0..1 << self.step.get().min(SPIN_LIMIT) {
-core::hint::spin_loop();
-}
-if self.step.get() <= SPIN_LIMIT {
-self.step.set(self.step.get() + 1);
-}
-}
-}
fn has_granularity_conflict(prev_ty: AllocationType, ty: AllocationType) -> bool {
prev_ty == AllocationType::Unknown || prev_ty != ty
}
let size = layout.size();
let alignment = layout.alignment();
-let backoff = Backoff::new();
-let mut state = self.state.load(Ordering::Relaxed);
-loop {
-let free_start = state >> 2;
+// These can't overflow because offsets are constrained by the size of the root
+// allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
+let prev_end = self.region_offset + self.free_start.get();
+let mut offset = align_up(prev_end, alignment);
-// These can't overflow because offsets are constrained by the size of the root
-// allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
-let prev_end = self.region_offset + free_start;
-let mut offset = align_up(prev_end, alignment);
-if buffer_image_granularity != DeviceAlignment::MIN {
-let prev_alloc_type = match state & 0b11 {
-0 => AllocationType::Unknown,
-1 => AllocationType::Linear,
-2 => AllocationType::NonLinear,
-_ => unreachable!(),
-};
-if prev_end > 0
-&& are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
-&& has_granularity_conflict(prev_alloc_type, allocation_type)
-{
-offset = align_up(offset, buffer_image_granularity);
-}
-}
-let relative_offset = offset - self.region_offset;
-let free_start = relative_offset + size;
-if free_start > self.region_size {
-return Err(SuballocatorError::OutOfRegionMemory);
-}
-// This can't discard any bits because we checked that `region_size` does not exceed
-// `DeviceLayout::MAX_SIZE >> 2`.
-let new_state = free_start << 2 | allocation_type as DeviceSize;
-match self.state.compare_exchange_weak(
-state,
-new_state,
-Ordering::Release,
-Ordering::Relaxed,
-) {
-Ok(_) => {
-return Ok(Suballocation {
-offset,
-size,
-handle: AllocationHandle(ptr::null_mut()),
-});
-}
-Err(new_state) => {
-state = new_state;
-backoff.spin();
-}
-}
+if buffer_image_granularity != DeviceAlignment::MIN
+&& prev_end > 0
+&& are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
+&& has_granularity_conflict(self.prev_allocation_type.get(), allocation_type)
+{
+offset = align_up(offset, buffer_image_granularity);
+}
+let relative_offset = offset - self.region_offset;
+let free_start = relative_offset + size;
+if free_start > self.region_size {
+return Err(SuballocatorError::OutOfRegionMemory);
+}
+self.free_start.set(free_start);
+self.prev_allocation_type.set(allocation_type);
+Ok(Suballocation {
+offset,
+size,
+handle: AllocationHandle(ptr::null_mut()),
+})
}
#[inline]
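With the atomic state and the `Backoff` loop gone, the new bump path is plain cursor arithmetic behind a `Cell`. A minimal single-threaded sketch of the same flow (granularity handling and `region_offset` omitted; `align_up` mimics vulkano's helper of the same name):

use std::cell::Cell;

fn align_up(value: u64, alignment: u64) -> u64 {
    debug_assert!(alignment.is_power_of_two());
    (value + alignment - 1) & !(alignment - 1)
}

struct Bump {
    region_size: u64,
    free_start: Cell<u64>,
}

impl Bump {
    fn allocate(&self, size: u64, alignment: u64) -> Result<u64, ()> {
        // Bump the cursor up to the required alignment, then past the allocation.
        let offset = align_up(self.free_start.get(), alignment);
        let free_start = offset + size;

        if free_start > self.region_size {
            return Err(()); // out of region memory
        }

        self.free_start.set(free_start);
        Ok(offset)
    }
}

fn main() {
    let bump = Bump { region_size: 256, free_start: Cell::new(0) };
    assert_eq!(bump.allocate(10, 1), Ok(0));
    assert_eq!(bump.allocate(16, 64), Ok(64)); // cursor at 10 is aligned up to 64
    assert!(bump.allocate(256, 1).is_err());
}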
@ -1158,7 +1082,7 @@ unsafe impl Suballocator for BumpAllocator {
#[inline]
fn free_size(&self) -> DeviceSize {
-self.region_size - (self.state.load(Ordering::Acquire) >> 2)
+self.region_size - self.free_start.get()
}
#[inline]
@ -1303,6 +1227,7 @@ mod host {
mod tests {
use super::*;
use crossbeam_queue::ArrayQueue;
+use parking_lot::Mutex;
use std::thread;
const fn unwrap<T: Copy>(opt: Option<T>) -> T {
@ -1322,7 +1247,7 @@ mod tests {
const REGION_SIZE: DeviceSize =
(ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;
-let allocator = FreeListAllocator::new(0, REGION_SIZE);
+let allocator = Mutex::new(FreeListAllocator::new(0, REGION_SIZE));
let allocs = ArrayQueue::new((ALLOCATIONS_PER_THREAD * THREADS) as usize);
// Using threads to randomize allocation order.
@ -1337,6 +1262,7 @@ mod tests {
allocs
.push(
allocator
+.lock()
.allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
.unwrap(),
)
@ -1346,6 +1272,8 @@ mod tests {
}
});
+let allocator = allocator.into_inner();
assert!(allocator
.allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
.is_err());
@ -1709,39 +1637,4 @@ mod tests {
allocator.reset();
assert!(allocator.free_size() == REGION_SIZE);
}
-#[test]
-fn bump_allocator_syncness() {
-const THREADS: DeviceSize = 12;
-const ALLOCATIONS_PER_THREAD: DeviceSize = 100_000;
-const ALLOCATION_STEP: DeviceSize = 117;
-const REGION_SIZE: DeviceSize =
-(ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;
-let mut allocator = BumpAllocator::new(0, REGION_SIZE);
-thread::scope(|scope| {
-for i in 1..=THREADS {
-let allocator = &allocator;
-scope.spawn(move || {
-let layout = DeviceLayout::from_size_alignment(i * ALLOCATION_STEP, 1).unwrap();
-for _ in 0..ALLOCATIONS_PER_THREAD {
-allocator
-.allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
-.unwrap();
-}
-});
-}
-});
-assert!(allocator
-.allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
-.is_err());
-assert!(allocator.free_size() == 0);
-allocator.reset();
-assert!(allocator.free_size() == REGION_SIZE);
-}
}
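The `bump_allocator_syncness` test above is deleted because a `!Sync` `BumpAllocator` can no longer be hammered from several threads directly; sharing now goes through a lock, as the updated free-list test shows with `Mutex::new(FreeListAllocator::new(..))`. A sketch of what that looks like from the caller's side, with a toy `Bump` type standing in for any of the suballocators:

use std::cell::Cell;
use std::sync::Mutex;
use std::thread;

struct Bump {
    // Interior mutability makes `Bump: !Sync`, but it is still `Send`.
    free_start: Cell<u64>,
    region_size: u64,
}

impl Bump {
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        let offset = self.free_start.get();
        if offset + size > self.region_size {
            return Err(());
        }
        self.free_start.set(offset + size);
        Ok(offset)
    }
}

fn main() {
    // `Mutex<Bump>` is `Sync` because `Bump` is `Send`, so a reference to it can be
    // shared across scoped threads even though `Bump` itself cannot.
    let allocator = Mutex::new(Bump { free_start: Cell::new(0), region_size: 4096 });

    thread::scope(|scope| {
        for _ in 0..4 {
            scope.spawn(|| {
                for _ in 0..8 {
                    allocator.lock().unwrap().allocate(16).unwrap();
                }
            });
        }
    });

    assert_eq!(allocator.into_inner().unwrap().free_start.get(), 4 * 8 * 16);
}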