From c93d71e06485b93144b52db6af861c29adebb029 Mon Sep 17 00:00:00 2001 From: marc0246 <40955683+marc0246@users.noreply.github.com> Date: Thu, 7 Sep 2023 09:39:47 +0200 Subject: [PATCH] Make the suballocators `!Sync` (#2317) --- vulkano/src/memory/allocator/mod.rs | 96 ++------ vulkano/src/memory/allocator/suballocator.rs | 223 +++++-------------- 2 files changed, 82 insertions(+), 237 deletions(-) diff --git a/vulkano/src/memory/allocator/mod.rs b/vulkano/src/memory/allocator/mod.rs index 65ced9ea..ae09ed2b 100644 --- a/vulkano/src/memory/allocator/mod.rs +++ b/vulkano/src/memory/allocator/mod.rs @@ -239,7 +239,7 @@ use crate::{ VulkanError, }; use ash::vk::{MAX_MEMORY_HEAPS, MAX_MEMORY_TYPES}; -use parking_lot::RwLock; +use parking_lot::Mutex; use std::{ error::Error, fmt::{Debug, Display, Error as FmtError, Formatter}, @@ -878,7 +878,7 @@ pub struct GenericMemoryAllocator { #[derive(Debug)] struct Pool { - blocks: RwLock>>>, + blocks: Mutex>>>, // This is cached here for faster access, so we don't need to hop through 3 pointers. memory_type: ash::vk::MemoryType, atom_size: DeviceAlignment, @@ -888,7 +888,7 @@ impl GenericMemoryAllocator { // This is a false-positive, we only use this const for static initialization. #[allow(clippy::declare_interior_mutable_const)] const EMPTY_POOL: Pool = Pool { - blocks: RwLock::new(Vec::new()), + blocks: Mutex::new(Vec::new()), memory_type: ash::vk::MemoryType { property_flags: ash::vk::MemoryPropertyFlags::empty(), heap_index: 0, @@ -1068,7 +1068,7 @@ impl GenericMemoryAllocator { } } -unsafe impl MemoryAllocator for GenericMemoryAllocator { +unsafe impl MemoryAllocator for GenericMemoryAllocator { fn find_memory_type_index( &self, memory_type_bits: u32, @@ -1145,64 +1145,19 @@ unsafe impl MemoryAllocator for Generic layout = layout.align_to(pool.atom_size).unwrap(); - let mut blocks = if S::IS_BLOCKING { - // If the allocation algorithm needs to block, then there's no point in trying to avoid - // locks here either. In that case the best strategy is to take full advantage of it by - // always taking an exclusive lock, which lets us sort the blocks by free size. If you - // as a user want to avoid locks, simply don't share the allocator between threads. You - // can create as many allocators as you wish, but keep in mind that that will waste a - // huge amount of memory unless you configure your block sizes properly! + let mut blocks = pool.blocks.lock(); - let mut blocks = pool.blocks.write(); - blocks.sort_by_key(|block| block.free_size()); - let (Ok(idx) | Err(idx)) = - blocks.binary_search_by_key(&size, |block| block.free_size()); + // TODO: Incremental sorting + blocks.sort_by_key(|block| block.free_size()); + let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size()); - for block in &blocks[idx..] { - if let Ok(allocation) = - block.allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } + for block in &blocks[idx..] { + if let Ok(allocation) = + block.allocate(layout, allocation_type, self.buffer_image_granularity) + { + return Ok(allocation); } - - blocks - } else { - // If the allocation algorithm is lock-free, then we should avoid taking an exclusive - // lock unless it is absolutely neccessary (meaning, only when allocating a new - // `DeviceMemory` block and inserting it into a pool). This has the disadvantage that - // traversing the pool is O(n), which is not a problem since the number of blocks is - // expected to be small. 
If there are more than 10 blocks in a pool then that's a - // configuration error. Also, sorting the blocks before each allocation would be less - // efficient because to get the free size of the `PoolAllocator` and `BumpAllocator` - // has the same performance as trying to allocate. - - let blocks = pool.blocks.read(); - - // Search in reverse order because we always append new blocks at the end. - for block in blocks.iter().rev() { - if let Ok(allocation) = - block.allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } - } - - let len = blocks.len(); - drop(blocks); - let blocks = pool.blocks.write(); - - if blocks.len() > len { - // Another thread beat us to it and inserted a fresh block, try to suballocate it. - if let Ok(allocation) = - blocks[len].allocate(layout, allocation_type, self.buffer_image_granularity) - { - return Ok(allocation); - } - } - - blocks - }; + } // For bump allocators, first do a garbage sweep and try to allocate again. if S::NEEDS_CLEANUP { @@ -1484,33 +1439,30 @@ unsafe impl MemoryAllocator for Generic unsafe fn deallocate(&self, allocation: MemoryAlloc) { if let Some(suballocation) = allocation.suballocation { + let memory_type_index = allocation.device_memory.memory_type_index(); + let pool = self.pools[memory_type_index as usize].blocks.lock(); let block_ptr = allocation.allocation_handle.0 as *const Block; // TODO: Maybe do a similar check for dedicated blocks. - #[cfg(debug_assertions)] - { - let memory_type_index = allocation.device_memory.memory_type_index(); - let pool = self.pools[memory_type_index as usize].blocks.read(); - - assert!( - pool.iter() - .any(|block| &**block as *const Block == block_ptr), - "attempted to deallocate a memory block that does not belong to this allocator", - ); - } + debug_assert!( + pool.iter() + .any(|block| &**block as *const Block == block_ptr), + "attempted to deallocate a memory block that does not belong to this allocator", + ); // SAFETY: The caller must guarantee that `allocation` refers to one allocated by // `self`, therefore `block_ptr` must be the same one we gave out on allocation. We // know that this pointer must be valid, because all blocks are boxed and pinned in // memory and because a block isn't dropped until the allocator itself is dropped, at // which point it would be impossible to call this method. We also know that it must be - // valid to create a reference to the block, because we only ever access it via shared - // references. + // valid to create a reference to the block, because we locked the pool it belongs to. let block = &*block_ptr; // SAFETY: The caller must guarantee that `allocation` refers to a currently allocated // allocation of `self`. block.deallocate(suballocation); + + drop(pool); } } } diff --git a/vulkano/src/memory/allocator/suballocator.rs b/vulkano/src/memory/allocator/suballocator.rs index bb922ba4..4400a3a5 100644 --- a/vulkano/src/memory/allocator/suballocator.rs +++ b/vulkano/src/memory/allocator/suballocator.rs @@ -18,14 +18,12 @@ use super::{ align_down, align_up, array_vec::ArrayVec, AllocationHandle, DeviceAlignment, DeviceLayout, }; use crate::{image::ImageTiling, memory::is_aligned, DeviceSize, NonZeroDeviceSize}; -use parking_lot::Mutex; use std::{ - cell::Cell, + cell::{Cell, UnsafeCell}, cmp, error::Error, fmt::{self, Display}, ptr, - sync::atomic::{AtomicU64, Ordering}, }; /// Suballocators are used to divide a *region* into smaller *suballocations*. 
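The `mod.rs` hunks above replace the old two-path locking scheme in `GenericMemoryAllocator::allocate` with a single strategy: always take the pool's `Mutex`, sort the blocks by free size, binary-search for the first block that could fit the request, and try the remaining candidates in order. The following sketch illustrates that selection strategy with made-up `Block` and `Pool` types (bare `u64` sizes instead of `DeviceLayout`, no allocation types or buffer-image granularity); it is an illustration of the pattern, not vulkano's actual API.

use parking_lot::Mutex;
use std::cell::Cell;

// Simplified stand-in for vulkano's `Block<S>`: a bump-style block with interior
// mutability. `used` is only touched while the pool's `Mutex` is held.
struct Block {
    size: u64,
    used: Cell<u64>,
}

impl Block {
    fn free_size(&self) -> u64 {
        self.size - self.used.get()
    }

    // Returns the offset of the suballocation on success; `Err(())` stands in
    // for `SuballocatorError`.
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        if size <= self.free_size() {
            let offset = self.used.get();
            self.used.set(offset + size);
            Ok(offset)
        } else {
            Err(())
        }
    }
}

struct Pool {
    // Boxing keeps every block at a stable address, which the real allocator
    // relies on when `deallocate` turns an `AllocationHandle` back into a
    // `*const Block`.
    blocks: Mutex<Vec<Box<Block>>>,
}

impl Pool {
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        let mut blocks = self.blocks.lock();

        // Sorting by free size lets the binary search skip blocks that are
        // definitely too small. A candidate can still fail (fragmentation,
        // alignment), so the remaining blocks are tried in order.
        blocks.sort_by_key(|block| block.free_size());
        let (Ok(idx) | Err(idx)) =
            blocks.binary_search_by_key(&size, |block| block.free_size());

        for block in &blocks[idx..] {
            if let Ok(offset) = block.allocate(size) {
                return Ok(offset);
            }
        }

        // The real allocator would allocate a fresh `DeviceMemory` block here
        // (and, for bump allocators, do a garbage sweep first); the sketch just
        // reports failure.
        Err(())
    }
}

fn main() {
    let pool = Pool {
        blocks: Mutex::new(vec![
            Box::new(Block { size: 1024, used: Cell::new(0) }),
            Box::new(Block { size: 4096, used: Cell::new(0) }),
        ]),
    };

    // The 2048-byte request skips the 1024-byte block entirely thanks to the
    // sort + binary search and lands in the larger block.
    assert_eq!(pool.allocate(2048), Ok(0));
    assert_eq!(pool.allocate(2048), Ok(2048));
    assert!(pool.allocate(2048).is_err());
}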
@@ -69,14 +67,6 @@ use std::{ /// [`DeviceMemory`]: crate::memory::DeviceMemory /// [pages]: super#pages pub unsafe trait Suballocator { - /// Whether this allocator needs to block or not. - /// - /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the - /// suballocator at compile time. - /// - /// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator - const IS_BLOCKING: bool; - /// Whether the allocator needs [`cleanup`] to be called before memory can be released. /// /// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the @@ -280,13 +270,11 @@ impl Display for SuballocatorError { pub struct FreeListAllocator { region_offset: DeviceSize, // Total memory remaining in the region. - free_size: AtomicU64, - state: Mutex, + free_size: Cell, + state: UnsafeCell, } unsafe impl Suballocator for FreeListAllocator { - const IS_BLOCKING: bool = true; - const NEEDS_CLEANUP: bool = false; /// Creates a new `FreeListAllocator` for the given [region]. @@ -296,7 +284,7 @@ unsafe impl Suballocator for FreeListAllocator { // NOTE(Marc): This number was pulled straight out of my a- const AVERAGE_ALLOCATION_SIZE: DeviceSize = 64 * 1024; - let free_size = AtomicU64::new(region_size); + let free_size = Cell::new(region_size); let capacity = (region_size / AVERAGE_ALLOCATION_SIZE) as usize; let mut nodes = host::PoolAllocator::new(capacity + 64); @@ -309,7 +297,7 @@ unsafe impl Suballocator for FreeListAllocator { ty: SuballocationType::Free, }); free_list.push(root_id); - let state = Mutex::new(FreeListAllocatorState { nodes, free_list }); + let state = UnsafeCell::new(FreeListAllocatorState { nodes, free_list }); FreeListAllocator { region_offset, @@ -337,7 +325,7 @@ unsafe impl Suballocator for FreeListAllocator { let size = layout.size(); let alignment = layout.alignment(); - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; unsafe { match state.free_list.last() { @@ -392,7 +380,7 @@ unsafe impl Suballocator for FreeListAllocator { // This can't overflow because suballocation sizes in the free-list are // constrained by the remaining size of the region. - self.free_size.fetch_sub(size, Ordering::Release); + self.free_size.set(self.free_size.get() - size); return Ok(Suballocation { offset, @@ -421,14 +409,14 @@ unsafe impl Suballocator for FreeListAllocator { // allocation of `self`. let node_id = SlotId::new(suballocation.handle.0 as _); - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; let node = state.nodes.get_mut(node_id); debug_assert!(node.ty != SuballocationType::Free); // Suballocation sizes are constrained by the size of the region, so they can't possibly // overflow when added up. - self.free_size.fetch_add(node.size, Ordering::Release); + self.free_size.set(self.free_size.get() + node.size); node.ty = SuballocationType::Free; state.coalesce(node_id); @@ -437,7 +425,7 @@ unsafe impl Suballocator for FreeListAllocator { #[inline] fn free_size(&self) -> DeviceSize { - self.free_size.load(Ordering::Acquire) + self.free_size.get() } #[inline] @@ -748,8 +736,8 @@ impl FreeListAllocatorState { pub struct BuddyAllocator { region_offset: DeviceSize, // Total memory remaining in the region. 
- free_size: AtomicU64, - state: Mutex, + free_size: Cell, + state: UnsafeCell, } impl BuddyAllocator { @@ -761,8 +749,6 @@ impl BuddyAllocator { } unsafe impl Suballocator for BuddyAllocator { - const IS_BLOCKING: bool = true; - const NEEDS_CLEANUP: bool = false; /// Creates a new `BuddyAllocator` for the given [region]. @@ -783,13 +769,13 @@ unsafe impl Suballocator for BuddyAllocator { assert!(max_order < BuddyAllocator::MAX_ORDERS); - let free_size = AtomicU64::new(region_size); + let free_size = Cell::new(region_size); let mut free_list = ArrayVec::new(max_order + 1, [EMPTY_FREE_LIST; BuddyAllocator::MAX_ORDERS]); // The root node has the lowest offset and highest order, so it's the whole region. free_list[max_order].push(region_offset); - let state = Mutex::new(BuddyAllocatorState { free_list }); + let state = UnsafeCell::new(BuddyAllocatorState { free_list }); BuddyAllocator { region_offset, @@ -840,7 +826,7 @@ unsafe impl Suballocator for BuddyAllocator { let size = cmp::max(size, BuddyAllocator::MIN_NODE_SIZE).next_power_of_two(); let min_order = (size / BuddyAllocator::MIN_NODE_SIZE).trailing_zeros() as usize; - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; // Start searching at the lowest possible order going up. for (order, free_list) in state.free_list.iter_mut().enumerate().skip(min_order) { @@ -875,7 +861,7 @@ unsafe impl Suballocator for BuddyAllocator { // This can't overflow because suballocation sizes in the free-list are // constrained by the remaining size of the region. - self.free_size.fetch_sub(size, Ordering::Release); + self.free_size.set(self.free_size.get() - size); return Ok(Suballocation { offset, @@ -900,7 +886,7 @@ unsafe impl Suballocator for BuddyAllocator { let order = suballocation.handle.0 as usize; let min_order = order; - let mut state = self.state.lock(); + let state = unsafe { &mut *self.state.get() }; debug_assert!(!state.free_list[order].contains(&offset)); @@ -930,7 +916,7 @@ unsafe impl Suballocator for BuddyAllocator { // The sizes of suballocations allocated by `self` are constrained by that of // its region, so they can't possibly overflow when added up. - self.free_size.fetch_add(size, Ordering::Release); + self.free_size.set(self.free_size.get() + size); break; } @@ -945,7 +931,7 @@ unsafe impl Suballocator for BuddyAllocator { /// [internal fragmentation]: super#internal-fragmentation #[inline] fn free_size(&self) -> DeviceSize { - self.free_size.load(Ordering::Acquire) + self.free_size.get() } #[inline] @@ -1014,9 +1000,8 @@ struct BuddyAllocatorState { pub struct BumpAllocator { region_offset: DeviceSize, region_size: DeviceSize, - // Encodes the previous allocation type in the 2 least signifficant bits and the free start in - // the rest. - state: AtomicU64, + free_start: Cell, + prev_allocation_type: Cell, } impl BumpAllocator { @@ -1025,29 +1010,23 @@ impl BumpAllocator { /// [region]: Suballocator#regions #[inline] pub fn reset(&mut self) { - *self.state.get_mut() = AllocationType::Unknown as DeviceSize; + *self.free_start.get_mut() = 0; + *self.prev_allocation_type.get_mut() = AllocationType::Unknown; } } unsafe impl Suballocator for BumpAllocator { - const IS_BLOCKING: bool = false; - const NEEDS_CLEANUP: bool = true; /// Creates a new `BumpAllocator` for the given [region]. 
/// /// [region]: Suballocator#regions fn new(region_offset: DeviceSize, region_size: DeviceSize) -> Self { - // Sanity check: this would lead to UB because of the left-shifting by 2 needed to encode - // the free-start into the state. - assert!(region_size <= (DeviceLayout::MAX_SIZE >> 2)); - - let state = AtomicU64::new(AllocationType::Unknown as DeviceSize); - BumpAllocator { region_offset, region_size, - state, + free_start: Cell::new(0), + prev_allocation_type: Cell::new(AllocationType::Unknown), } } @@ -1058,97 +1037,42 @@ unsafe impl Suballocator for BumpAllocator { allocation_type: AllocationType, buffer_image_granularity: DeviceAlignment, ) -> Result { - const SPIN_LIMIT: u32 = 6; - - // NOTE(Marc): The following code is a minimal version `Backoff` taken from - // crossbeam_utils v0.8.11, because we didn't want to add a dependency for a couple lines - // that are used in one place only. - /// Original documentation: - /// https://docs.rs/crossbeam-utils/0.8.11/crossbeam_utils/struct.Backoff.html - struct Backoff { - step: Cell, - } - - impl Backoff { - fn new() -> Self { - Backoff { step: Cell::new(0) } - } - - fn spin(&self) { - for _ in 0..1 << self.step.get().min(SPIN_LIMIT) { - core::hint::spin_loop(); - } - - if self.step.get() <= SPIN_LIMIT { - self.step.set(self.step.get() + 1); - } - } - } - fn has_granularity_conflict(prev_ty: AllocationType, ty: AllocationType) -> bool { prev_ty == AllocationType::Unknown || prev_ty != ty } let size = layout.size(); let alignment = layout.alignment(); - let backoff = Backoff::new(); - let mut state = self.state.load(Ordering::Relaxed); - loop { - let free_start = state >> 2; + // These can't overflow because offsets are constrained by the size of the root + // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`. + let prev_end = self.region_offset + self.free_start.get(); + let mut offset = align_up(prev_end, alignment); - // These can't overflow because offsets are constrained by the size of the root - // allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`. - let prev_end = self.region_offset + free_start; - let mut offset = align_up(prev_end, alignment); - - if buffer_image_granularity != DeviceAlignment::MIN { - let prev_alloc_type = match state & 0b11 { - 0 => AllocationType::Unknown, - 1 => AllocationType::Linear, - 2 => AllocationType::NonLinear, - _ => unreachable!(), - }; - - if prev_end > 0 - && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity) - && has_granularity_conflict(prev_alloc_type, allocation_type) - { - offset = align_up(offset, buffer_image_granularity); - } - } - - let relative_offset = offset - self.region_offset; - - let free_start = relative_offset + size; - - if free_start > self.region_size { - return Err(SuballocatorError::OutOfRegionMemory); - } - - // This can't discard any bits because we checked that `region_size` does not exceed - // `DeviceLayout::MAX_SIZE >> 2`. 
- let new_state = free_start << 2 | allocation_type as DeviceSize; - - match self.state.compare_exchange_weak( - state, - new_state, - Ordering::Release, - Ordering::Relaxed, - ) { - Ok(_) => { - return Ok(Suballocation { - offset, - size, - handle: AllocationHandle(ptr::null_mut()), - }); - } - Err(new_state) => { - state = new_state; - backoff.spin(); - } - } + if buffer_image_granularity != DeviceAlignment::MIN + && prev_end > 0 + && are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity) + && has_granularity_conflict(self.prev_allocation_type.get(), allocation_type) + { + offset = align_up(offset, buffer_image_granularity); } + + let relative_offset = offset - self.region_offset; + + let free_start = relative_offset + size; + + if free_start > self.region_size { + return Err(SuballocatorError::OutOfRegionMemory); + } + + self.free_start.set(free_start); + self.prev_allocation_type.set(allocation_type); + + Ok(Suballocation { + offset, + size, + handle: AllocationHandle(ptr::null_mut()), + }) } #[inline] @@ -1158,7 +1082,7 @@ unsafe impl Suballocator for BumpAllocator { #[inline] fn free_size(&self) -> DeviceSize { - self.region_size - (self.state.load(Ordering::Acquire) >> 2) + self.region_size - self.free_start.get() } #[inline] @@ -1303,6 +1227,7 @@ mod host { mod tests { use super::*; use crossbeam_queue::ArrayQueue; + use parking_lot::Mutex; use std::thread; const fn unwrap(opt: Option) -> T { @@ -1322,7 +1247,7 @@ mod tests { const REGION_SIZE: DeviceSize = (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD; - let allocator = FreeListAllocator::new(0, REGION_SIZE); + let allocator = Mutex::new(FreeListAllocator::new(0, REGION_SIZE)); let allocs = ArrayQueue::new((ALLOCATIONS_PER_THREAD * THREADS) as usize); // Using threads to randomize allocation order. @@ -1337,6 +1262,7 @@ mod tests { allocs .push( allocator + .lock() .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN) .unwrap(), ) @@ -1346,6 +1272,8 @@ mod tests { } }); + let allocator = allocator.into_inner(); + assert!(allocator .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN) .is_err()); @@ -1709,39 +1637,4 @@ mod tests { allocator.reset(); assert!(allocator.free_size() == REGION_SIZE); } - - #[test] - fn bump_allocator_syncness() { - const THREADS: DeviceSize = 12; - const ALLOCATIONS_PER_THREAD: DeviceSize = 100_000; - const ALLOCATION_STEP: DeviceSize = 117; - const REGION_SIZE: DeviceSize = - (ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD; - - let mut allocator = BumpAllocator::new(0, REGION_SIZE); - - thread::scope(|scope| { - for i in 1..=THREADS { - let allocator = &allocator; - - scope.spawn(move || { - let layout = DeviceLayout::from_size_alignment(i * ALLOCATION_STEP, 1).unwrap(); - - for _ in 0..ALLOCATIONS_PER_THREAD { - allocator - .allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN) - .unwrap(); - } - }); - } - }); - - assert!(allocator - .allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN) - .is_err()); - assert!(allocator.free_size() == 0); - - allocator.reset(); - assert!(allocator.free_size() == REGION_SIZE); - } }
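On the suballocator side, the patch swaps the internal `AtomicU64` counters and `parking_lot::Mutex` state for `Cell` and `UnsafeCell`. Both of those are `!Sync`, so the suballocators automatically become `!Sync` as well, and any caller that still wants to share one across threads has to add its own lock, which is what the updated `FreeListAllocator` test does by wrapping the allocator in a `Mutex`. The sketch below uses a made-up `TinyBumpAllocator` (bare `u64` sizes, no `AllocationType` or buffer-image-granularity handling) to show the simplified bump path without the old compare-exchange loop, plus what the `Mutex`-wrapping looks like from the caller's side; it is an illustration of the pattern, not vulkano's real types.

use parking_lot::Mutex;
use std::cell::Cell;
use std::thread;

// Hypothetical, minimal bump allocator in the style the patch moves to: plain
// `Cell` state instead of an encoded `AtomicU64` plus compare-exchange loop.
struct TinyBumpAllocator {
    region_size: u64,
    free_start: Cell<u64>, // `Cell` is `!Sync`, so the whole struct is `!Sync`
}

impl TinyBumpAllocator {
    fn new(region_size: u64) -> Self {
        TinyBumpAllocator {
            region_size,
            free_start: Cell::new(0),
        }
    }

    // Returns the offset of the suballocation, or `Err(())` (standing in for
    // `SuballocatorError::OutOfRegionMemory`) if the region is exhausted.
    fn allocate(&self, size: u64, alignment: u64) -> Result<u64, ()> {
        debug_assert!(alignment.is_power_of_two());

        // Align the cursor, check the bound, bump the cursor. No CAS loop is
        // needed: `&self` cannot be used from two threads at once because the
        // type is `!Sync`.
        let offset = (self.free_start.get() + alignment - 1) & !(alignment - 1);
        let free_start = offset.checked_add(size).ok_or(())?;

        if free_start > self.region_size {
            return Err(());
        }

        self.free_start.set(free_start);

        Ok(offset)
    }

    fn free_size(&self) -> u64 {
        self.region_size - self.free_start.get()
    }
}

fn main() {
    // Sharing the allocator across threads now requires an explicit lock, just
    // like the updated FreeListAllocator test wraps its allocator in a `Mutex`.
    let allocator = Mutex::new(TinyBumpAllocator::new(1 << 20));

    thread::scope(|scope| {
        for _ in 0..4 {
            scope.spawn(|| {
                for _ in 0..100 {
                    allocator.lock().allocate(256, 64).unwrap();
                }
            });
        }
    });

    // 4 threads * 100 allocations * 256 bytes, with no alignment padding since
    // 256 is already a multiple of the 64-byte alignment.
    assert_eq!(allocator.into_inner().free_size(), (1 << 20) - 4 * 100 * 256);
}

The external lock is opt-in: a suballocator that stays on one thread pays no synchronization cost at all, which is the point of making these types `!Sync`.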