mirror of https://github.com/vulkano-rs/vulkano.git
Make the suballocators !Sync (#2317)

parent 64ea44c25e
commit c93d71e064
@@ -239,7 +239,7 @@ use crate::{
     VulkanError,
 };
 use ash::vk::{MAX_MEMORY_HEAPS, MAX_MEMORY_TYPES};
-use parking_lot::RwLock;
+use parking_lot::Mutex;
 use std::{
     error::Error,
     fmt::{Debug, Display, Error as FmtError, Formatter},
@@ -878,7 +878,7 @@ pub struct GenericMemoryAllocator<S> {

 #[derive(Debug)]
 struct Pool<S> {
-    blocks: RwLock<Vec<Box<Block<S>>>>,
+    blocks: Mutex<Vec<Box<Block<S>>>>,
     // This is cached here for faster access, so we don't need to hop through 3 pointers.
     memory_type: ash::vk::MemoryType,
     atom_size: DeviceAlignment,
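Note (illustration, not part of the diff): `Mutex<T>` is `Sync` whenever `T` is `Send`, so guarding the blocks with a `Mutex` keeps the pool shareable across threads even though the suballocators themselves are no longer `Sync`; this is also why the `MemoryAllocator` impl further down can drop its `Sync` bound on `S`. A minimal sketch with standard-library types and a made-up suballocator:

use std::cell::Cell;
use std::sync::Mutex;

// Hypothetical stand-in for a suballocator that uses interior mutability,
// which makes it `!Sync` (a `Cell` cannot be shared between threads).
struct ToySuballocator {
    free_start: Cell<u64>,
}

// A pool in the spirit of the patch: the `Mutex` provides the synchronization,
// so only `Send` is required of the suballocator, not `Sync`.
struct ToyPool {
    blocks: Mutex<Vec<ToySuballocator>>,
}

fn assert_send_sync<T: Send + Sync>() {}

fn main() {
    // Compiles because `Mutex<T>: Sync` only needs `T: Send`.
    assert_send_sync::<ToyPool>();
    // `assert_send_sync::<ToySuballocator>()` would fail: `Cell` is not `Sync`.

    let pool = ToyPool {
        blocks: Mutex::new(vec![ToySuballocator { free_start: Cell::new(0) }]),
    };
    pool.blocks.lock().unwrap()[0].free_start.set(64);
    assert_eq!(pool.blocks.lock().unwrap()[0].free_start.get(), 64);
}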
@@ -888,7 +888,7 @@ impl<S> GenericMemoryAllocator<S> {
     // This is a false-positive, we only use this const for static initialization.
     #[allow(clippy::declare_interior_mutable_const)]
     const EMPTY_POOL: Pool<S> = Pool {
-        blocks: RwLock::new(Vec::new()),
+        blocks: Mutex::new(Vec::new()),
         memory_type: ash::vk::MemoryType {
             property_flags: ash::vk::MemoryPropertyFlags::empty(),
             heap_index: 0,
@@ -1068,7 +1068,7 @@ impl<S> GenericMemoryAllocator<S> {
     }
 }

-unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
+unsafe impl<S: Suballocator + Send + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
     fn find_memory_type_index(
         &self,
         memory_type_bits: u32,
@@ -1145,64 +1145,19 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for GenericMemoryAllocator<S> {

         layout = layout.align_to(pool.atom_size).unwrap();

-        let mut blocks = if S::IS_BLOCKING {
-            // If the allocation algorithm needs to block, then there's no point in trying to avoid
-            // locks here either. In that case the best strategy is to take full advantage of it by
-            // always taking an exclusive lock, which lets us sort the blocks by free size. If you
-            // as a user want to avoid locks, simply don't share the allocator between threads. You
-            // can create as many allocators as you wish, but keep in mind that that will waste a
-            // huge amount of memory unless you configure your block sizes properly!
-
-            let mut blocks = pool.blocks.write();
-
-            // TODO: Incremental sorting
-            blocks.sort_by_key(|block| block.free_size());
-            let (Ok(idx) | Err(idx)) =
-                blocks.binary_search_by_key(&size, |block| block.free_size());
-
-            for block in &blocks[idx..] {
-                if let Ok(allocation) =
-                    block.allocate(layout, allocation_type, self.buffer_image_granularity)
-                {
-                    return Ok(allocation);
-                }
-            }
-
-            blocks
-        } else {
-            // If the allocation algorithm is lock-free, then we should avoid taking an exclusive
-            // lock unless it is absolutely neccessary (meaning, only when allocating a new
-            // `DeviceMemory` block and inserting it into a pool). This has the disadvantage that
-            // traversing the pool is O(n), which is not a problem since the number of blocks is
-            // expected to be small. If there are more than 10 blocks in a pool then that's a
-            // configuration error. Also, sorting the blocks before each allocation would be less
-            // efficient because to get the free size of the `PoolAllocator` and `BumpAllocator`
-            // has the same performance as trying to allocate.
-
-            let blocks = pool.blocks.read();
-
-            // Search in reverse order because we always append new blocks at the end.
-            for block in blocks.iter().rev() {
-                if let Ok(allocation) =
-                    block.allocate(layout, allocation_type, self.buffer_image_granularity)
-                {
-                    return Ok(allocation);
-                }
-            }
-
-            let len = blocks.len();
-            drop(blocks);
-            let blocks = pool.blocks.write();
-
-            if blocks.len() > len {
-                // Another thread beat us to it and inserted a fresh block, try to suballocate it.
-                if let Ok(allocation) =
-                    blocks[len].allocate(layout, allocation_type, self.buffer_image_granularity)
-                {
-                    return Ok(allocation);
-                }
-            }
-
-            blocks
-        };
+        let mut blocks = pool.blocks.lock();
+
+        // TODO: Incremental sorting
+        blocks.sort_by_key(|block| block.free_size());
+        let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size());
+
+        for block in &blocks[idx..] {
+            if let Ok(allocation) =
+                block.allocate(layout, allocation_type, self.buffer_image_granularity)
+            {
+                return Ok(allocation);
+            }
+        }

         // For bump allocators, first do a garbage sweep and try to allocate again.
         if S::NEEDS_CLEANUP {
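For illustration only (toy `Block` type, plain `u64` sizes, no alignment handling), this is the shape of the strategy that now runs unconditionally, since the lock-free path is gone: take the lock, sort the blocks by free size, binary-search for the first block that could fit, then try the candidates in order:

use std::sync::Mutex;

// Toy stand-in for a memory block; only what the search needs.
struct Block {
    free: u64,
}

impl Block {
    // Pretend-allocate: succeeds if enough space is left.
    fn allocate(&mut self, size: u64) -> Result<u64, ()> {
        if size <= self.free {
            let offset = 0; // a real block would compute a proper offset here
            self.free -= size;
            Ok(offset)
        } else {
            Err(())
        }
    }
}

fn allocate_from_pool(pool: &Mutex<Vec<Block>>, size: u64) -> Option<u64> {
    let mut blocks = pool.lock().unwrap();

    // Sort ascending by free size so the binary search below lands on the
    // first block that might fit (best-fit-ish strategy).
    blocks.sort_by_key(|block| block.free);
    let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free);

    // Every block from `idx` on has at least `size` bytes free (modulo
    // alignment, which the real allocator also has to account for).
    for block in &mut blocks[idx..] {
        if let Ok(offset) = block.allocate(size) {
            return Some(offset);
        }
    }

    None
}

fn main() {
    let pool = Mutex::new(vec![Block { free: 128 }, Block { free: 1024 }]);
    assert_eq!(allocate_from_pool(&pool, 256), Some(0));
    assert!(allocate_from_pool(&pool, 4096).is_none());
}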
@@ -1484,33 +1439,30 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for GenericMemoryAllocator<S> {

     unsafe fn deallocate(&self, allocation: MemoryAlloc) {
         if let Some(suballocation) = allocation.suballocation {
+            let memory_type_index = allocation.device_memory.memory_type_index();
+            let pool = self.pools[memory_type_index as usize].blocks.lock();
             let block_ptr = allocation.allocation_handle.0 as *const Block<S>;

             // TODO: Maybe do a similar check for dedicated blocks.
-            #[cfg(debug_assertions)]
-            {
-                let memory_type_index = allocation.device_memory.memory_type_index();
-                let pool = self.pools[memory_type_index as usize].blocks.read();
-                assert!(
-                    pool.iter()
-                        .any(|block| &**block as *const Block<S> == block_ptr),
-                    "attempted to deallocate a memory block that does not belong to this allocator",
-                );
-            }
+            debug_assert!(
+                pool.iter()
+                    .any(|block| &**block as *const Block<S> == block_ptr),
+                "attempted to deallocate a memory block that does not belong to this allocator",
+            );

             // SAFETY: The caller must guarantee that `allocation` refers to one allocated by
             // `self`, therefore `block_ptr` must be the same one we gave out on allocation. We
             // know that this pointer must be valid, because all blocks are boxed and pinned in
             // memory and because a block isn't dropped until the allocator itself is dropped, at
             // which point it would be impossible to call this method. We also know that it must be
-            // valid to create a reference to the block, because we only ever access it via shared
-            // references.
+            // valid to create a reference to the block, because we locked the pool it belongs to.
             let block = &*block_ptr;

             // SAFETY: The caller must guarantee that `allocation` refers to a currently allocated
             // allocation of `self`.
             block.deallocate(suballocation);

+            drop(pool);
         }
     }
 }
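A small sketch (toy types, not vulkano's) of the ownership check that now runs as a `debug_assert!` while the pool is locked: the allocation handle is a raw pointer to a boxed block, so membership in the pool can be verified by pointer identity:

struct Block {
    id: u32,
}

struct Pool {
    // Boxing keeps each block at a stable address, so a raw pointer to it
    // stays valid for as long as the pool owns the box.
    blocks: Vec<Box<Block>>,
}

fn belongs_to_pool(pool: &Pool, block_ptr: *const Block) -> bool {
    pool.blocks
        .iter()
        .any(|block| &**block as *const Block == block_ptr)
}

fn main() {
    let pool = Pool {
        blocks: vec![Box::new(Block { id: 0 }), Box::new(Block { id: 1 })],
    };
    let handle = &*pool.blocks[1] as *const Block;
    assert_eq!(pool.blocks[1].id, 1);

    debug_assert!(
        belongs_to_pool(&pool, handle),
        "attempted to deallocate a memory block that does not belong to this allocator",
    );

    let stray = Box::new(Block { id: 99 });
    assert!(!belongs_to_pool(&pool, &*stray as *const Block));
}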
@@ -18,14 +18,12 @@ use super::{
     align_down, align_up, array_vec::ArrayVec, AllocationHandle, DeviceAlignment, DeviceLayout,
 };
 use crate::{image::ImageTiling, memory::is_aligned, DeviceSize, NonZeroDeviceSize};
-use parking_lot::Mutex;
 use std::{
-    cell::Cell,
+    cell::{Cell, UnsafeCell},
     cmp,
     error::Error,
     fmt::{self, Display},
     ptr,
-    sync::atomic::{AtomicU64, Ordering},
 };

 /// Suballocators are used to divide a *region* into smaller *suballocations*.
@@ -69,14 +67,6 @@ use std::{
 /// [`DeviceMemory`]: crate::memory::DeviceMemory
 /// [pages]: super#pages
 pub unsafe trait Suballocator {
-    /// Whether this allocator needs to block or not.
-    ///
-    /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
-    /// suballocator at compile time.
-    ///
-    /// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator
-    const IS_BLOCKING: bool;
-
     /// Whether the allocator needs [`cleanup`] to be called before memory can be released.
     ///
     /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
@@ -280,13 +270,11 @@ impl Display for SuballocatorError {
 pub struct FreeListAllocator {
     region_offset: DeviceSize,
     // Total memory remaining in the region.
-    free_size: AtomicU64,
-    state: Mutex<FreeListAllocatorState>,
+    free_size: Cell<DeviceSize>,
+    state: UnsafeCell<FreeListAllocatorState>,
 }

 unsafe impl Suballocator for FreeListAllocator {
-    const IS_BLOCKING: bool = true;
-
     const NEEDS_CLEANUP: bool = false;

     /// Creates a new `FreeListAllocator` for the given [region].
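A minimal sketch (hypothetical `ToyFreeList`, not the real `FreeListAllocator`) of what trading `AtomicU64`/`Mutex` for `Cell`/`UnsafeCell` means: `&self` methods can still mutate state, but `UnsafeCell` makes the type `!Sync`, so the compiler rejects unsynchronized sharing across threads instead of every call paying for atomics and a lock:

use std::cell::{Cell, UnsafeCell};

struct ToyFreeList {
    free_size: Cell<u64>,
    // In the patch this holds the node pool and free-list; a Vec stands in here.
    state: UnsafeCell<Vec<u64>>,
}

impl ToyFreeList {
    fn new(region_size: u64) -> Self {
        ToyFreeList {
            free_size: Cell::new(region_size),
            state: UnsafeCell::new(vec![region_size]),
        }
    }

    fn allocate(&self, size: u64) -> Option<()> {
        // SAFETY (single-threaded by construction): `UnsafeCell` makes this type
        // `!Sync`, so no other thread can hold a `&self` concurrently, and this
        // method creates only one exclusive reference at a time.
        let state = unsafe { &mut *self.state.get() };
        let free = state.last_mut()?;
        if *free < size {
            return None;
        }
        *free -= size;
        self.free_size.set(self.free_size.get() - size);
        Some(())
    }

    fn free_size(&self) -> u64 {
        self.free_size.get()
    }
}

fn main() {
    let alloc = ToyFreeList::new(1024);
    alloc.allocate(256).unwrap();
    assert_eq!(alloc.free_size(), 768);
    // Sharing `&ToyFreeList` between threads no longer compiles; to share it,
    // wrap it in a `Mutex` (as the updated tests in this commit do).
}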
@@ -296,7 +284,7 @@ unsafe impl Suballocator for FreeListAllocator {
         // NOTE(Marc): This number was pulled straight out of my a-
         const AVERAGE_ALLOCATION_SIZE: DeviceSize = 64 * 1024;

-        let free_size = AtomicU64::new(region_size);
+        let free_size = Cell::new(region_size);

         let capacity = (region_size / AVERAGE_ALLOCATION_SIZE) as usize;
         let mut nodes = host::PoolAllocator::new(capacity + 64);
@@ -309,7 +297,7 @@ unsafe impl Suballocator for FreeListAllocator {
             ty: SuballocationType::Free,
         });
         free_list.push(root_id);
-        let state = Mutex::new(FreeListAllocatorState { nodes, free_list });
+        let state = UnsafeCell::new(FreeListAllocatorState { nodes, free_list });

         FreeListAllocator {
             region_offset,
@@ -337,7 +325,7 @@ unsafe impl Suballocator for FreeListAllocator {

         let size = layout.size();
         let alignment = layout.alignment();
-        let mut state = self.state.lock();
+        let state = unsafe { &mut *self.state.get() };

         unsafe {
             match state.free_list.last() {
@@ -392,7 +380,7 @@ unsafe impl Suballocator for FreeListAllocator {

                     // This can't overflow because suballocation sizes in the free-list are
                     // constrained by the remaining size of the region.
-                    self.free_size.fetch_sub(size, Ordering::Release);
+                    self.free_size.set(self.free_size.get() - size);

                     return Ok(Suballocation {
                         offset,
@@ -421,14 +409,14 @@ unsafe impl Suballocator for FreeListAllocator {
         // allocation of `self`.
         let node_id = SlotId::new(suballocation.handle.0 as _);

-        let mut state = self.state.lock();
+        let state = unsafe { &mut *self.state.get() };
         let node = state.nodes.get_mut(node_id);

         debug_assert!(node.ty != SuballocationType::Free);

         // Suballocation sizes are constrained by the size of the region, so they can't possibly
         // overflow when added up.
-        self.free_size.fetch_add(node.size, Ordering::Release);
+        self.free_size.set(self.free_size.get() + node.size);

         node.ty = SuballocationType::Free;
         state.coalesce(node_id);
@@ -437,7 +425,7 @@ unsafe impl Suballocator for FreeListAllocator {

     #[inline]
     fn free_size(&self) -> DeviceSize {
-        self.free_size.load(Ordering::Acquire)
+        self.free_size.get()
     }

     #[inline]
@@ -748,8 +736,8 @@ impl FreeListAllocatorState {
 pub struct BuddyAllocator {
     region_offset: DeviceSize,
     // Total memory remaining in the region.
-    free_size: AtomicU64,
-    state: Mutex<BuddyAllocatorState>,
+    free_size: Cell<DeviceSize>,
+    state: UnsafeCell<BuddyAllocatorState>,
 }

 impl BuddyAllocator {
@@ -761,8 +749,6 @@ impl BuddyAllocator {
 }

 unsafe impl Suballocator for BuddyAllocator {
-    const IS_BLOCKING: bool = true;
-
     const NEEDS_CLEANUP: bool = false;

     /// Creates a new `BuddyAllocator` for the given [region].
@@ -783,13 +769,13 @@ unsafe impl Suballocator for BuddyAllocator {

         assert!(max_order < BuddyAllocator::MAX_ORDERS);

-        let free_size = AtomicU64::new(region_size);
+        let free_size = Cell::new(region_size);

         let mut free_list =
             ArrayVec::new(max_order + 1, [EMPTY_FREE_LIST; BuddyAllocator::MAX_ORDERS]);
         // The root node has the lowest offset and highest order, so it's the whole region.
         free_list[max_order].push(region_offset);
-        let state = Mutex::new(BuddyAllocatorState { free_list });
+        let state = UnsafeCell::new(BuddyAllocatorState { free_list });

         BuddyAllocator {
             region_offset,
@@ -840,7 +826,7 @@ unsafe impl Suballocator for BuddyAllocator {
         let size = cmp::max(size, BuddyAllocator::MIN_NODE_SIZE).next_power_of_two();

         let min_order = (size / BuddyAllocator::MIN_NODE_SIZE).trailing_zeros() as usize;
-        let mut state = self.state.lock();
+        let state = unsafe { &mut *self.state.get() };

         // Start searching at the lowest possible order going up.
         for (order, free_list) in state.free_list.iter_mut().enumerate().skip(min_order) {
@@ -875,7 +861,7 @@ unsafe impl Suballocator for BuddyAllocator {

                 // This can't overflow because suballocation sizes in the free-list are
                 // constrained by the remaining size of the region.
-                self.free_size.fetch_sub(size, Ordering::Release);
+                self.free_size.set(self.free_size.get() - size);

                 return Ok(Suballocation {
                     offset,
@@ -900,7 +886,7 @@ unsafe impl Suballocator for BuddyAllocator {
         let order = suballocation.handle.0 as usize;

         let min_order = order;
-        let mut state = self.state.lock();
+        let state = unsafe { &mut *self.state.get() };

         debug_assert!(!state.free_list[order].contains(&offset));

@@ -930,7 +916,7 @@ unsafe impl Suballocator for BuddyAllocator {

                 // The sizes of suballocations allocated by `self` are constrained by that of
                 // its region, so they can't possibly overflow when added up.
-                self.free_size.fetch_add(size, Ordering::Release);
+                self.free_size.set(self.free_size.get() + size);

                 break;
             }
@@ -945,7 +931,7 @@ unsafe impl Suballocator for BuddyAllocator {
     /// [internal fragmentation]: super#internal-fragmentation
     #[inline]
     fn free_size(&self) -> DeviceSize {
-        self.free_size.load(Ordering::Acquire)
+        self.free_size.get()
     }

     #[inline]
@@ -1014,9 +1000,8 @@ struct BuddyAllocatorState {
 pub struct BumpAllocator {
     region_offset: DeviceSize,
     region_size: DeviceSize,
-    // Encodes the previous allocation type in the 2 least signifficant bits and the free start in
-    // the rest.
-    state: AtomicU64,
+    free_start: Cell<DeviceSize>,
+    prev_allocation_type: Cell<AllocationType>,
 }

 impl BumpAllocator {
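The removed comment referred to a bit-packing scheme; as a worked example (recreated here for illustration, mirroring the removed code), the old `state` packed the free start into the upper bits and the allocation type into the two least significant bits, i.e. `state = free_start << 2 | allocation_type`, decoded with `state >> 2` and `state & 0b11`. The replacement simply stores the two values in separate `Cell`s:

// Recreation of the removed encoding, for illustration only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum AllocationType {
    Unknown = 0,
    Linear = 1,
    NonLinear = 2,
}

fn encode(free_start: u64, ty: AllocationType) -> u64 {
    // Requires free_start <= u64::MAX >> 2, which the removed code guaranteed
    // by asserting `region_size <= DeviceLayout::MAX_SIZE >> 2`.
    free_start << 2 | ty as u64
}

fn decode(state: u64) -> (u64, AllocationType) {
    let ty = match state & 0b11 {
        0 => AllocationType::Unknown,
        1 => AllocationType::Linear,
        2 => AllocationType::NonLinear,
        _ => unreachable!(),
    };
    (state >> 2, ty)
}

fn main() {
    let state = encode(4096, AllocationType::Linear);
    assert_eq!(state, 4096 * 4 + 1);
    assert_eq!(decode(state), (4096, AllocationType::Linear));
}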
@@ -1025,29 +1010,23 @@ impl BumpAllocator {
     /// [region]: Suballocator#regions
     #[inline]
     pub fn reset(&mut self) {
-        *self.state.get_mut() = AllocationType::Unknown as DeviceSize;
+        *self.free_start.get_mut() = 0;
+        *self.prev_allocation_type.get_mut() = AllocationType::Unknown;
     }
 }

 unsafe impl Suballocator for BumpAllocator {
-    const IS_BLOCKING: bool = false;
-
     const NEEDS_CLEANUP: bool = true;

     /// Creates a new `BumpAllocator` for the given [region].
     ///
     /// [region]: Suballocator#regions
     fn new(region_offset: DeviceSize, region_size: DeviceSize) -> Self {
-        // Sanity check: this would lead to UB because of the left-shifting by 2 needed to encode
-        // the free-start into the state.
-        assert!(region_size <= (DeviceLayout::MAX_SIZE >> 2));
-
-        let state = AtomicU64::new(AllocationType::Unknown as DeviceSize);
-
         BumpAllocator {
             region_offset,
             region_size,
-            state,
+            free_start: Cell::new(0),
+            prev_allocation_type: Cell::new(AllocationType::Unknown),
         }
     }

@@ -1058,97 +1037,42 @@ unsafe impl Suballocator for BumpAllocator {
         allocation_type: AllocationType,
         buffer_image_granularity: DeviceAlignment,
     ) -> Result<Suballocation, SuballocatorError> {
-        const SPIN_LIMIT: u32 = 6;
-
-        // NOTE(Marc): The following code is a minimal version `Backoff` taken from
-        // crossbeam_utils v0.8.11, because we didn't want to add a dependency for a couple lines
-        // that are used in one place only.
-        /// Original documentation:
-        /// https://docs.rs/crossbeam-utils/0.8.11/crossbeam_utils/struct.Backoff.html
-        struct Backoff {
-            step: Cell<u32>,
-        }
-
-        impl Backoff {
-            fn new() -> Self {
-                Backoff { step: Cell::new(0) }
-            }
-
-            fn spin(&self) {
-                for _ in 0..1 << self.step.get().min(SPIN_LIMIT) {
-                    core::hint::spin_loop();
-                }
-
-                if self.step.get() <= SPIN_LIMIT {
-                    self.step.set(self.step.get() + 1);
-                }
-            }
-        }
-
         fn has_granularity_conflict(prev_ty: AllocationType, ty: AllocationType) -> bool {
             prev_ty == AllocationType::Unknown || prev_ty != ty
         }

         let size = layout.size();
         let alignment = layout.alignment();
-        let backoff = Backoff::new();
-        let mut state = self.state.load(Ordering::Relaxed);
-
-        loop {
-            let free_start = state >> 2;
-
-            // These can't overflow because offsets are constrained by the size of the root
-            // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
-            let prev_end = self.region_offset + free_start;
-            let mut offset = align_up(prev_end, alignment);
-
-            if buffer_image_granularity != DeviceAlignment::MIN {
-                let prev_alloc_type = match state & 0b11 {
-                    0 => AllocationType::Unknown,
-                    1 => AllocationType::Linear,
-                    2 => AllocationType::NonLinear,
-                    _ => unreachable!(),
-                };
-
-                if prev_end > 0
-                    && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
-                    && has_granularity_conflict(prev_alloc_type, allocation_type)
-                {
-                    offset = align_up(offset, buffer_image_granularity);
-                }
-            }
-
-            let relative_offset = offset - self.region_offset;
-
-            let free_start = relative_offset + size;
-
-            if free_start > self.region_size {
-                return Err(SuballocatorError::OutOfRegionMemory);
-            }
-
-            // This can't discard any bits because we checked that `region_size` does not exceed
-            // `DeviceLayout::MAX_SIZE >> 2`.
-            let new_state = free_start << 2 | allocation_type as DeviceSize;
-
-            match self.state.compare_exchange_weak(
-                state,
-                new_state,
-                Ordering::Release,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => {
-                    return Ok(Suballocation {
-                        offset,
-                        size,
-                        handle: AllocationHandle(ptr::null_mut()),
-                    });
-                }
-                Err(new_state) => {
-                    state = new_state;
-                    backoff.spin();
-                }
-            }
-        }
+
+        // These can't overflow because offsets are constrained by the size of the root
+        // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
+        let prev_end = self.region_offset + self.free_start.get();
+        let mut offset = align_up(prev_end, alignment);
+
+        if buffer_image_granularity != DeviceAlignment::MIN
+            && prev_end > 0
+            && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
+            && has_granularity_conflict(self.prev_allocation_type.get(), allocation_type)
+        {
+            offset = align_up(offset, buffer_image_granularity);
+        }
+
+        let relative_offset = offset - self.region_offset;
+
+        let free_start = relative_offset + size;
+
+        if free_start > self.region_size {
+            return Err(SuballocatorError::OutOfRegionMemory);
+        }
+
+        self.free_start.set(free_start);
+        self.prev_allocation_type.set(allocation_type);
+
+        Ok(Suballocation {
+            offset,
+            size,
+            handle: AllocationHandle(ptr::null_mut()),
+        })
     }

     #[inline]
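A self-contained sketch of the single-threaded bump logic that replaces the CAS loop (simplified: plain `u64`s, a local `align_up`, and no buffer-image granularity handling):

use std::cell::Cell;

fn align_up(value: u64, alignment: u64) -> u64 {
    // `alignment` must be a power of two.
    debug_assert!(alignment.is_power_of_two());
    (value + alignment - 1) & !(alignment - 1)
}

struct ToyBumpAllocator {
    region_offset: u64,
    region_size: u64,
    free_start: Cell<u64>,
}

impl ToyBumpAllocator {
    fn new(region_offset: u64, region_size: u64) -> Self {
        ToyBumpAllocator { region_offset, region_size, free_start: Cell::new(0) }
    }

    // No loop and no compare-exchange: the type is !Sync, so a plain read,
    // check and write is enough.
    fn allocate(&self, size: u64, alignment: u64) -> Option<u64> {
        let prev_end = self.region_offset + self.free_start.get();
        let offset = align_up(prev_end, alignment);

        let free_start = offset - self.region_offset + size;
        if free_start > self.region_size {
            return None; // out of region memory
        }

        self.free_start.set(free_start);
        Some(offset)
    }

    fn free_size(&self) -> u64 {
        self.region_size - self.free_start.get()
    }
}

fn main() {
    let allocator = ToyBumpAllocator::new(0, 1024);
    assert_eq!(allocator.allocate(100, 1), Some(0));
    assert_eq!(allocator.allocate(100, 64), Some(128)); // 100 aligned up to 128
    assert_eq!(allocator.free_size(), 1024 - 228);
}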
@@ -1158,7 +1082,7 @@ unsafe impl Suballocator for BumpAllocator {

     #[inline]
     fn free_size(&self) -> DeviceSize {
-        self.region_size - (self.state.load(Ordering::Acquire) >> 2)
+        self.region_size - self.free_start.get()
     }

     #[inline]
@@ -1303,6 +1227,7 @@ mod host {
 mod tests {
     use super::*;
     use crossbeam_queue::ArrayQueue;
+    use parking_lot::Mutex;
     use std::thread;

     const fn unwrap<T: Copy>(opt: Option<T>) -> T {
@@ -1322,7 +1247,7 @@ mod tests {
         const REGION_SIZE: DeviceSize =
             (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;

-        let allocator = FreeListAllocator::new(0, REGION_SIZE);
+        let allocator = Mutex::new(FreeListAllocator::new(0, REGION_SIZE));
         let allocs = ArrayQueue::new((ALLOCATIONS_PER_THREAD * THREADS) as usize);

         // Using threads to randomize allocation order.
@@ -1337,6 +1262,7 @@ mod tests {
                     allocs
                         .push(
                             allocator
+                                .lock()
                                 .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
                                 .unwrap(),
                         )
@@ -1346,6 +1272,8 @@ mod tests {
             }
         });

+        let allocator = allocator.into_inner();
+
         assert!(allocator
             .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
             .is_err());
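The test changes show the intended usage pattern: the suballocators are no longer `Sync`, so a test that allocates from several threads now wraps the allocator in a `Mutex` and takes it back out with `into_inner()` afterwards. A standalone sketch of the same pattern, using `std::sync::Mutex` here (the test itself uses `parking_lot::Mutex`, whose `lock()` is infallible):

use std::cell::Cell;
use std::sync::Mutex;
use std::thread;

// Stand-in for a !Sync suballocator (interior mutability via Cell).
struct ToyAllocator {
    used: Cell<u64>,
}

impl ToyAllocator {
    fn allocate(&self, size: u64) {
        self.used.set(self.used.get() + size);
    }
}

fn main() {
    // The Mutex supplies the Sync-ness the allocator itself no longer has.
    let allocator = Mutex::new(ToyAllocator { used: Cell::new(0) });

    thread::scope(|scope| {
        for _ in 0..4 {
            let allocator = &allocator;
            scope.spawn(move || {
                for _ in 0..1000 {
                    allocator.lock().unwrap().allocate(1);
                }
            });
        }
    });

    // Once the threads are done, unwrap the Mutex to use the allocator directly.
    let allocator = allocator.into_inner().unwrap();
    assert_eq!(allocator.used.get(), 4 * 1000);
}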
@@ -1709,39 +1637,4 @@ mod tests {
         allocator.reset();
         assert!(allocator.free_size() == REGION_SIZE);
     }
-
-    #[test]
-    fn bump_allocator_syncness() {
-        const THREADS: DeviceSize = 12;
-        const ALLOCATIONS_PER_THREAD: DeviceSize = 100_000;
-        const ALLOCATION_STEP: DeviceSize = 117;
-        const REGION_SIZE: DeviceSize =
-            (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;
-
-        let mut allocator = BumpAllocator::new(0, REGION_SIZE);
-
-        thread::scope(|scope| {
-            for i in 1..=THREADS {
-                let allocator = &allocator;
-
-                scope.spawn(move || {
-                    let layout = DeviceLayout::from_size_alignment(i * ALLOCATION_STEP, 1).unwrap();
-
-                    for _ in 0..ALLOCATIONS_PER_THREAD {
-                        allocator
-                            .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
-                            .unwrap();
-                    }
-                });
-            }
-        });
-
-        assert!(allocator
-            .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
-            .is_err());
-        assert!(allocator.free_size() == 0);
-
-        allocator.reset();
-        assert!(allocator.free_size() == REGION_SIZE);
-    }
 }