Make the suballocators !Sync (#2317)

marc0246 2023-09-07 09:39:47 +02:00 committed by GitHub
parent 64ea44c25e
commit c93d71e064
2 changed files with 82 additions and 237 deletions
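For context, a minimal sketch (not taken from the diff; the `Demo` type and `assert_send` helper are illustrative only) of why replacing atomics and locks with `Cell`/`UnsafeCell` fields makes a type `!Sync` while leaving it `Send`:

use std::cell::Cell;

// A stand-in for a suballocator that tracks its state with `Cell` instead of atomics.
struct Demo {
    free_size: Cell<u64>,
}

// `Cell<u64>` is `Send` but not `Sync`, and auto traits propagate to containing
// structs, so `Demo: Send` holds while `Demo: Sync` does not.
fn assert_send<T: Send>() {}

fn main() {
    assert_send::<Demo>(); // compiles: the allocator can still be moved to another thread
    // A corresponding `fn assert_sync<T: Sync>() {}` call would fail to compile,
    // which is the `!Sync` property the commit title refers to.
    let demo = Demo { free_size: Cell::new(1024) };
    demo.free_size.set(demo.free_size.get() - 64); // interior mutability through `&self`
    assert_eq!(demo.free_size.get(), 960);
}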

Changed file 1 of 2

@ -239,7 +239,7 @@ use crate::{
VulkanError,
};
use ash::vk::{MAX_MEMORY_HEAPS, MAX_MEMORY_TYPES};
-use parking_lot::RwLock;
+use parking_lot::Mutex;
use std::{
error::Error,
fmt::{Debug, Display, Error as FmtError, Formatter},
@ -878,7 +878,7 @@ pub struct GenericMemoryAllocator<S> {
#[derive(Debug)]
struct Pool<S> {
-blocks: RwLock<Vec<Box<Block<S>>>>,
+blocks: Mutex<Vec<Box<Block<S>>>>,
// This is cached here for faster access, so we don't need to hop through 3 pointers.
memory_type: ash::vk::MemoryType,
atom_size: DeviceAlignment,
@ -888,7 +888,7 @@ impl<S> GenericMemoryAllocator<S> {
// This is a false-positive, we only use this const for static initialization.
#[allow(clippy::declare_interior_mutable_const)]
const EMPTY_POOL: Pool<S> = Pool {
-blocks: RwLock::new(Vec::new()),
+blocks: Mutex::new(Vec::new()),
memory_type: ash::vk::MemoryType {
property_flags: ash::vk::MemoryPropertyFlags::empty(),
heap_index: 0,
@ -1068,7 +1068,7 @@ impl<S> GenericMemoryAllocator<S> {
}
}
-unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
+unsafe impl<S: Suballocator + Send + 'static> MemoryAllocator for GenericMemoryAllocator<S> {
fn find_memory_type_index(
&self,
memory_type_bits: u32,
@ -1145,64 +1145,19 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for Generic
layout = layout.align_to(pool.atom_size).unwrap();
-let mut blocks = if S::IS_BLOCKING {
-// If the allocation algorithm needs to block, then there's no point in trying to avoid
-// locks here either. In that case the best strategy is to take full advantage of it by
-// always taking an exclusive lock, which lets us sort the blocks by free size. If you
-// as a user want to avoid locks, simply don't share the allocator between threads. You
-// can create as many allocators as you wish, but keep in mind that this will waste a
-// huge amount of memory unless you configure your block sizes properly!
+let mut blocks = pool.blocks.lock();
-let mut blocks = pool.blocks.write();
-blocks.sort_by_key(|block| block.free_size());
-let (Ok(idx) | Err(idx)) =
-blocks.binary_search_by_key(&size, |block| block.free_size());
+// TODO: Incremental sorting
+blocks.sort_by_key(|block| block.free_size());
+let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size());
-for block in &blocks[idx..] {
-if let Ok(allocation) =
-block.allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
+for block in &blocks[idx..] {
+if let Ok(allocation) =
+block.allocate(layout, allocation_type, self.buffer_image_granularity)
+{
+return Ok(allocation);
+}
-blocks
-} else {
-// If the allocation algorithm is lock-free, then we should avoid taking an exclusive
-// lock unless it is absolutely necessary (meaning, only when allocating a new
-// `DeviceMemory` block and inserting it into a pool). This has the disadvantage that
-// traversing the pool is O(n), which is not a problem since the number of blocks is
-// expected to be small. If there are more than 10 blocks in a pool then that's a
-// configuration error. Also, sorting the blocks before each allocation would be less
-// efficient because getting the free size of the `PoolAllocator` and `BumpAllocator`
-// costs the same as trying to allocate.
-let blocks = pool.blocks.read();
-// Search in reverse order because we always append new blocks at the end.
-for block in blocks.iter().rev() {
-if let Ok(allocation) =
-block.allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
-}
-let len = blocks.len();
-drop(blocks);
-let blocks = pool.blocks.write();
-if blocks.len() > len {
-// Another thread beat us to it and inserted a fresh block, try to suballocate it.
-if let Ok(allocation) =
-blocks[len].allocate(layout, allocation_type, self.buffer_image_granularity)
-{
-return Ok(allocation);
-}
-}
-blocks
-};
+}
// For bump allocators, first do a garbage sweep and try to allocate again.
if S::NEEDS_CLEANUP {
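The rewritten path above always takes the `Mutex`, sorts the blocks by free size, and binary-searches for the first block that might fit. A simplified sketch of that best-fit search, using a toy `Block` type in place of vulkano's (the names here are illustrative, not the crate's API):

// Toy stand-in for a memory block that knows how much free space it has.
struct Block {
    free: u64,
}

impl Block {
    fn free_size(&self) -> u64 {
        self.free
    }

    // Pretend suballocation: succeeds only if the block has enough free space left.
    fn allocate(&mut self, size: u64) -> Result<u64, ()> {
        if self.free >= size {
            self.free -= size;
            Ok(0) // a real block would compute a proper offset
        } else {
            Err(())
        }
    }
}

// Mirrors the new strategy: sort ascending by free size, binary-search the first
// candidate that could fit, then try candidates from there upward.
fn allocate_from_pool(blocks: &mut Vec<Block>, size: u64) -> Option<u64> {
    blocks.sort_by_key(|block| block.free_size());
    let (Ok(idx) | Err(idx)) = blocks.binary_search_by_key(&size, |block| block.free_size());

    for block in &mut blocks[idx..] {
        if let Ok(offset) = block.allocate(size) {
            return Some(offset);
        }
    }

    None // the real allocator would create a fresh `DeviceMemory` block here
}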
@ -1484,33 +1439,30 @@ unsafe impl<S: Suballocator + Send + Sync + 'static> MemoryAllocator for Generic
unsafe fn deallocate(&self, allocation: MemoryAlloc) {
if let Some(suballocation) = allocation.suballocation {
+let memory_type_index = allocation.device_memory.memory_type_index();
+let pool = self.pools[memory_type_index as usize].blocks.lock();
let block_ptr = allocation.allocation_handle.0 as *const Block<S>;
// TODO: Maybe do a similar check for dedicated blocks.
-#[cfg(debug_assertions)]
-{
-let memory_type_index = allocation.device_memory.memory_type_index();
-let pool = self.pools[memory_type_index as usize].blocks.read();
-assert!(
-pool.iter()
-.any(|block| &**block as *const Block<S> == block_ptr),
-"attempted to deallocate a memory block that does not belong to this allocator",
-);
-}
+debug_assert!(
+pool.iter()
+.any(|block| &**block as *const Block<S> == block_ptr),
+"attempted to deallocate a memory block that does not belong to this allocator",
+);
// SAFETY: The caller must guarantee that `allocation` refers to one allocated by
// `self`, therefore `block_ptr` must be the same one we gave out on allocation. We
// know that this pointer must be valid, because all blocks are boxed and pinned in
// memory and because a block isn't dropped until the allocator itself is dropped, at
// which point it would be impossible to call this method. We also know that it must be
-// valid to create a reference to the block, because we only ever access it via shared
-// references.
+// valid to create a reference to the block, because we locked the pool it belongs to.
let block = &*block_ptr;
// SAFETY: The caller must guarantee that `allocation` refers to a currently allocated
// allocation of `self`.
block.deallocate(suballocation);
+drop(pool);
}
}
}
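The `debug_assert!` above relies on pointer identity: every block is boxed, so its heap address is stable for the allocator's lifetime and can serve as the allocation handle. A small illustration of that idea (toy types, not the crate's):

struct Block {
    id: u32,
}

fn main() {
    let pool: Vec<Box<Block>> = vec![Box::new(Block { id: 0 }), Box::new(Block { id: 1 })];

    // Hand out a raw pointer to a block, much like the allocator stores one in an
    // `AllocationHandle` when it creates an allocation.
    let handle: *const Block = &*pool[1];

    // On deallocation, membership can be checked by comparing addresses alone.
    let belongs = pool.iter().any(|block| &**block as *const Block == handle);
    println!("block {} belongs to this pool: {}", pool[1].id, belongs);
    assert!(belongs);
}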

Changed file 2 of 2

@ -18,14 +18,12 @@ use super::{
align_down, align_up, array_vec::ArrayVec, AllocationHandle, DeviceAlignment, DeviceLayout,
};
use crate::{image::ImageTiling, memory::is_aligned, DeviceSize, NonZeroDeviceSize};
-use parking_lot::Mutex;
use std::{
-cell::Cell,
+cell::{Cell, UnsafeCell},
cmp,
error::Error,
fmt::{self, Display},
ptr,
-sync::atomic::{AtomicU64, Ordering},
};
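The `UnsafeCell` import added here supports the pattern used throughout the new code below: `unsafe { &mut *self.state.get() }` hands out a mutable reference through `&self`, which is sound only because the allocator is now `!Sync` (no other thread can hold `&self` concurrently) and the method never re-enters itself. A self-contained sketch of that pattern with toy types (not vulkano's):

use std::cell::UnsafeCell;

struct State {
    free_list: Vec<u64>,
}

struct Alloc {
    // `UnsafeCell` makes `Alloc: !Sync`, so `&Alloc` can never be shared across threads.
    state: UnsafeCell<State>,
}

impl Alloc {
    fn new() -> Self {
        Alloc {
            state: UnsafeCell::new(State { free_list: vec![0] }),
        }
    }

    fn pop(&self) -> Option<u64> {
        // SAFETY (sketch): only the current thread can hold `&self`, and this method
        // does not call back into itself, so no other reference to `State` is live.
        let state = unsafe { &mut *self.state.get() };
        state.free_list.pop()
    }
}

fn main() {
    let alloc = Alloc::new();
    assert_eq!(alloc.pop(), Some(0));
    assert_eq!(alloc.pop(), None);
}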
/// Suballocators are used to divide a *region* into smaller *suballocations*.
@ -69,14 +67,6 @@ use std::{
/// [`DeviceMemory`]: crate::memory::DeviceMemory
/// [pages]: super#pages
pub unsafe trait Suballocator {
-/// Whether this allocator needs to block or not.
-///
-/// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
-/// suballocator at compile time.
-///
-/// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator
-const IS_BLOCKING: bool;
/// Whether the allocator needs [`cleanup`] to be called before memory can be released.
///
/// This is used by the [`GenericMemoryAllocator`] to specialize the allocation strategy to the
@ -280,13 +270,11 @@ impl Display for SuballocatorError {
pub struct FreeListAllocator {
region_offset: DeviceSize,
// Total memory remaining in the region.
-free_size: AtomicU64,
-state: Mutex<FreeListAllocatorState>,
+free_size: Cell<DeviceSize>,
+state: UnsafeCell<FreeListAllocatorState>,
}
unsafe impl Suballocator for FreeListAllocator {
-const IS_BLOCKING: bool = true;
const NEEDS_CLEANUP: bool = false;
/// Creates a new `FreeListAllocator` for the given [region].
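The removed `IS_BLOCKING` constant (see the trait docs deleted above and the per-allocator definitions deleted below) let `GenericMemoryAllocator` pick an allocation strategy at compile time. A rough sketch of that associated-const pattern, with toy types that are not part of vulkano:

// The generic code branches on an associated `bool`; after monomorphization the
// unused arm is compiled out, so the check is free at run time.
trait Suballoc {
    const IS_BLOCKING: bool;
}

struct Blocking;
impl Suballoc for Blocking {
    const IS_BLOCKING: bool = true;
}

struct LockFree;
impl Suballoc for LockFree {
    const IS_BLOCKING: bool = false;
}

fn strategy<S: Suballoc>() -> &'static str {
    if S::IS_BLOCKING {
        "take the write lock up front and sort the blocks"
    } else {
        "take the read lock and scan, upgrading only to insert a new block"
    }
}

fn main() {
    println!("{}", strategy::<Blocking>());
    println!("{}", strategy::<LockFree>());
}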
@ -296,7 +284,7 @@ unsafe impl Suballocator for FreeListAllocator {
// NOTE(Marc): This number was pulled straight out of my a-
const AVERAGE_ALLOCATION_SIZE: DeviceSize = 64 * 1024;
-let free_size = AtomicU64::new(region_size);
+let free_size = Cell::new(region_size);
let capacity = (region_size / AVERAGE_ALLOCATION_SIZE) as usize;
let mut nodes = host::PoolAllocator::new(capacity + 64);
@ -309,7 +297,7 @@ unsafe impl Suballocator for FreeListAllocator {
ty: SuballocationType::Free,
});
free_list.push(root_id);
-let state = Mutex::new(FreeListAllocatorState { nodes, free_list });
+let state = UnsafeCell::new(FreeListAllocatorState { nodes, free_list });
FreeListAllocator {
region_offset,
@ -337,7 +325,7 @@ unsafe impl Suballocator for FreeListAllocator {
let size = layout.size();
let alignment = layout.alignment();
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
unsafe {
match state.free_list.last() {
@ -392,7 +380,7 @@ unsafe impl Suballocator for FreeListAllocator {
// This can't overflow because suballocation sizes in the free-list are
// constrained by the remaining size of the region.
-self.free_size.fetch_sub(size, Ordering::Release);
+self.free_size.set(self.free_size.get() - size);
return Ok(Suballocation {
offset,
@ -421,14 +409,14 @@ unsafe impl Suballocator for FreeListAllocator {
// allocation of `self`.
let node_id = SlotId::new(suballocation.handle.0 as _);
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
let node = state.nodes.get_mut(node_id);
debug_assert!(node.ty != SuballocationType::Free);
// Suballocation sizes are constrained by the size of the region, so they can't possibly
// overflow when added up.
-self.free_size.fetch_add(node.size, Ordering::Release);
+self.free_size.set(self.free_size.get() + node.size);
node.ty = SuballocationType::Free;
state.coalesce(node_id);
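`coalesce` merges the freed node with free neighbors so the free list does not splinter into many small pieces. Vulkano does this over a node pool with neighbor links; the following is only a conceptual sketch over a sorted list of ranges:

#[derive(Debug, PartialEq)]
struct FreeRange {
    offset: u64,
    size: u64,
}

// Merge ranges whose end touches the next range's start.
fn coalesce(mut free: Vec<FreeRange>) -> Vec<FreeRange> {
    free.sort_by_key(|range| range.offset);
    let mut merged: Vec<FreeRange> = Vec::new();

    for range in free {
        if let Some(prev) = merged.last_mut() {
            if prev.offset + prev.size == range.offset {
                // The previous free range ends exactly where this one starts: merge.
                prev.size += range.size;
                continue;
            }
        }
        merged.push(range);
    }

    merged
}

fn main() {
    let merged = coalesce(vec![
        FreeRange { offset: 0, size: 64 },
        FreeRange { offset: 64, size: 32 },   // adjacent to the first: merges into it
        FreeRange { offset: 256, size: 128 }, // gap before this one: stays separate
    ]);
    assert_eq!(merged, vec![
        FreeRange { offset: 0, size: 96 },
        FreeRange { offset: 256, size: 128 },
    ]);
}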
@ -437,7 +425,7 @@ unsafe impl Suballocator for FreeListAllocator {
#[inline]
fn free_size(&self) -> DeviceSize {
-self.free_size.load(Ordering::Acquire)
+self.free_size.get()
}
#[inline]
@ -748,8 +736,8 @@ impl FreeListAllocatorState {
pub struct BuddyAllocator {
region_offset: DeviceSize,
// Total memory remaining in the region.
-free_size: AtomicU64,
-state: Mutex<BuddyAllocatorState>,
+free_size: Cell<DeviceSize>,
+state: UnsafeCell<BuddyAllocatorState>,
}
impl BuddyAllocator {
@ -761,8 +749,6 @@ impl BuddyAllocator {
}
unsafe impl Suballocator for BuddyAllocator {
-const IS_BLOCKING: bool = true;
const NEEDS_CLEANUP: bool = false;
/// Creates a new `BuddyAllocator` for the given [region].
@ -783,13 +769,13 @@ unsafe impl Suballocator for BuddyAllocator {
assert!(max_order < BuddyAllocator::MAX_ORDERS);
-let free_size = AtomicU64::new(region_size);
+let free_size = Cell::new(region_size);
let mut free_list =
ArrayVec::new(max_order + 1, [EMPTY_FREE_LIST; BuddyAllocator::MAX_ORDERS]);
// The root node has the lowest offset and highest order, so it's the whole region.
free_list[max_order].push(region_offset);
-let state = Mutex::new(BuddyAllocatorState { free_list });
+let state = UnsafeCell::new(BuddyAllocatorState { free_list });
BuddyAllocator {
region_offset,
@ -840,7 +826,7 @@ unsafe impl Suballocator for BuddyAllocator {
let size = cmp::max(size, BuddyAllocator::MIN_NODE_SIZE).next_power_of_two();
let min_order = (size / BuddyAllocator::MIN_NODE_SIZE).trailing_zeros() as usize;
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
// Start searching at the lowest possible order going up.
for (order, free_list) in state.free_list.iter_mut().enumerate().skip(min_order) {
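A worked example of the two lines above that compute `min_order`, assuming `MIN_NODE_SIZE` is 16 (the real constant lives on `BuddyAllocator`; 16 is used here only for illustration):

const MIN_NODE_SIZE: u64 = 16;

fn min_order(size: u64) -> usize {
    // Round the request up to a power of two no smaller than the minimum node size...
    let size = std::cmp::max(size, MIN_NODE_SIZE).next_power_of_two();
    // ...then the order is how many doublings of MIN_NODE_SIZE that corresponds to.
    (size / MIN_NODE_SIZE).trailing_zeros() as usize
}

fn main() {
    assert_eq!(min_order(1), 0);   // rounds up to 16  -> one minimum-size node
    assert_eq!(min_order(17), 1);  // rounds up to 32  -> 32 / 16 = 2 = 2^1
    assert_eq!(min_order(200), 4); // rounds up to 256 -> 256 / 16 = 16 = 2^4
}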
@ -875,7 +861,7 @@ unsafe impl Suballocator for BuddyAllocator {
// This can't overflow because suballocation sizes in the free-list are
// constrained by the remaining size of the region.
-self.free_size.fetch_sub(size, Ordering::Release);
+self.free_size.set(self.free_size.get() - size);
return Ok(Suballocation {
offset,
@ -900,7 +886,7 @@ unsafe impl Suballocator for BuddyAllocator {
let order = suballocation.handle.0 as usize;
let min_order = order;
-let mut state = self.state.lock();
+let state = unsafe { &mut *self.state.get() };
debug_assert!(!state.free_list[order].contains(&offset));
@ -930,7 +916,7 @@ unsafe impl Suballocator for BuddyAllocator {
// The sizes of suballocations allocated by `self` are constrained by that of
// its region, so they can't possibly overflow when added up.
-self.free_size.fetch_add(size, Ordering::Release);
+self.free_size.set(self.free_size.get() + size);
break;
}
@ -945,7 +931,7 @@ unsafe impl Suballocator for BuddyAllocator {
/// [internal fragmentation]: super#internal-fragmentation
#[inline]
fn free_size(&self) -> DeviceSize {
-self.free_size.load(Ordering::Acquire)
+self.free_size.get()
}
#[inline]
@ -1014,9 +1000,8 @@ struct BuddyAllocatorState {
pub struct BumpAllocator {
region_offset: DeviceSize,
region_size: DeviceSize,
-// Encodes the previous allocation type in the 2 least significant bits and the free start in
-// the rest.
-state: AtomicU64,
+free_start: Cell<DeviceSize>,
+prev_allocation_type: Cell<AllocationType>,
}
impl BumpAllocator {
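The removed `state: AtomicU64` packed two values into one word, as the deleted comment describes: the previous allocation type in the two low bits and the free-start offset shifted left by two. A sketch of that encoding (the discriminant values 0/1/2 match the `match state & 0b11` arms removed further down; everything else is illustrative):

#[derive(Clone, Copy, Debug, PartialEq)]
enum AllocationType {
    Unknown = 0,
    Linear = 1,
    NonLinear = 2,
}

fn encode(free_start: u64, ty: AllocationType) -> u64 {
    // Requires `free_start <= u64::MAX >> 2`, which the old constructor guaranteed by
    // asserting `region_size <= DeviceLayout::MAX_SIZE >> 2`.
    (free_start << 2) | ty as u64
}

fn decode(state: u64) -> (u64, AllocationType) {
    let ty = match state & 0b11 {
        0 => AllocationType::Unknown,
        1 => AllocationType::Linear,
        2 => AllocationType::NonLinear,
        _ => unreachable!(),
    };
    (state >> 2, ty)
}

fn main() {
    let state = encode(4096, AllocationType::Linear);
    assert_eq!(decode(state), (4096, AllocationType::Linear));
}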
@ -1025,29 +1010,23 @@ impl BumpAllocator {
/// [region]: Suballocator#regions
#[inline]
pub fn reset(&mut self) {
-*self.state.get_mut() = AllocationType::Unknown as DeviceSize;
+*self.free_start.get_mut() = 0;
+*self.prev_allocation_type.get_mut() = AllocationType::Unknown;
}
}
unsafe impl Suballocator for BumpAllocator {
-const IS_BLOCKING: bool = false;
const NEEDS_CLEANUP: bool = true;
/// Creates a new `BumpAllocator` for the given [region].
///
/// [region]: Suballocator#regions
fn new(region_offset: DeviceSize, region_size: DeviceSize) -> Self {
-// Sanity check: this would lead to UB because of the left-shifting by 2 needed to encode
-// the free-start into the state.
-assert!(region_size <= (DeviceLayout::MAX_SIZE >> 2));
-let state = AtomicU64::new(AllocationType::Unknown as DeviceSize);
BumpAllocator {
region_offset,
region_size,
-state,
+free_start: Cell::new(0),
+prev_allocation_type: Cell::new(AllocationType::Unknown),
}
}
@ -1058,97 +1037,42 @@ unsafe impl Suballocator for BumpAllocator {
allocation_type: AllocationType,
buffer_image_granularity: DeviceAlignment,
) -> Result<Suballocation, SuballocatorError> {
-const SPIN_LIMIT: u32 = 6;
-// NOTE(Marc): The following code is a minimal version of `Backoff` taken from
-// crossbeam_utils v0.8.11, because we didn't want to add a dependency for a couple lines
-// that are used in one place only.
-/// Original documentation:
-/// https://docs.rs/crossbeam-utils/0.8.11/crossbeam_utils/struct.Backoff.html
-struct Backoff {
-step: Cell<u32>,
-}
-impl Backoff {
-fn new() -> Self {
-Backoff { step: Cell::new(0) }
-}
-fn spin(&self) {
-for _ in 0..1 << self.step.get().min(SPIN_LIMIT) {
-core::hint::spin_loop();
-}
-if self.step.get() <= SPIN_LIMIT {
-self.step.set(self.step.get() + 1);
-}
-}
-}
fn has_granularity_conflict(prev_ty: AllocationType, ty: AllocationType) -> bool {
prev_ty == AllocationType::Unknown || prev_ty != ty
}
let size = layout.size();
let alignment = layout.alignment();
-let backoff = Backoff::new();
-let mut state = self.state.load(Ordering::Relaxed);
-loop {
-let free_start = state >> 2;
+// These can't overflow because offsets are constrained by the size of the root
+// allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
+let prev_end = self.region_offset + self.free_start.get();
+let mut offset = align_up(prev_end, alignment);
-// These can't overflow because offsets are constrained by the size of the root
-// allocation, which can itself not exceed `DeviceLayout::MAX_SIZE`.
-let prev_end = self.region_offset + free_start;
-let mut offset = align_up(prev_end, alignment);
-if buffer_image_granularity != DeviceAlignment::MIN {
-let prev_alloc_type = match state & 0b11 {
-0 => AllocationType::Unknown,
-1 => AllocationType::Linear,
-2 => AllocationType::NonLinear,
-_ => unreachable!(),
-};
-if prev_end > 0
-&& are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
-&& has_granularity_conflict(prev_alloc_type, allocation_type)
-{
-offset = align_up(offset, buffer_image_granularity);
-}
-}
-let relative_offset = offset - self.region_offset;
-let free_start = relative_offset + size;
-if free_start > self.region_size {
-return Err(SuballocatorError::OutOfRegionMemory);
-}
-// This can't discard any bits because we checked that `region_size` does not exceed
-// `DeviceLayout::MAX_SIZE >> 2`.
-let new_state = free_start << 2 | allocation_type as DeviceSize;
-match self.state.compare_exchange_weak(
-state,
-new_state,
-Ordering::Release,
-Ordering::Relaxed,
-) {
-Ok(_) => {
-return Ok(Suballocation {
-offset,
-size,
-handle: AllocationHandle(ptr::null_mut()),
-});
-}
-Err(new_state) => {
-state = new_state;
-backoff.spin();
-}
-}
+if buffer_image_granularity != DeviceAlignment::MIN
+&& prev_end > 0
+&& are_blocks_on_same_page(0, prev_end, offset, buffer_image_granularity)
+&& has_granularity_conflict(self.prev_allocation_type.get(), allocation_type)
+{
+offset = align_up(offset, buffer_image_granularity);
+}
+let relative_offset = offset - self.region_offset;
+let free_start = relative_offset + size;
+if free_start > self.region_size {
+return Err(SuballocatorError::OutOfRegionMemory);
+}
+self.free_start.set(free_start);
+self.prev_allocation_type.set(allocation_type);
+Ok(Suballocation {
+offset,
+size,
+handle: AllocationHandle(ptr::null_mut()),
+})
}
#[inline]
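With the atomic state and the `Backoff` loop gone, the new bump path is plain cursor arithmetic behind a `Cell`. A minimal single-threaded sketch of the same flow (granularity handling and `region_offset` omitted; `align_up` mimics vulkano's helper of the same name):

use std::cell::Cell;

fn align_up(value: u64, alignment: u64) -> u64 {
    debug_assert!(alignment.is_power_of_two());
    (value + alignment - 1) & !(alignment - 1)
}

struct Bump {
    region_size: u64,
    free_start: Cell<u64>,
}

impl Bump {
    fn allocate(&self, size: u64, alignment: u64) -> Result<u64, ()> {
        // Bump the cursor up to the required alignment, then past the allocation.
        let offset = align_up(self.free_start.get(), alignment);
        let free_start = offset + size;

        if free_start > self.region_size {
            return Err(()); // out of region memory
        }

        self.free_start.set(free_start);
        Ok(offset)
    }
}

fn main() {
    let bump = Bump { region_size: 256, free_start: Cell::new(0) };
    assert_eq!(bump.allocate(10, 1), Ok(0));
    assert_eq!(bump.allocate(16, 64), Ok(64)); // cursor at 10 is aligned up to 64
    assert!(bump.allocate(256, 1).is_err());
}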
@ -1158,7 +1082,7 @@ unsafe impl Suballocator for BumpAllocator {
#[inline]
fn free_size(&self) -> DeviceSize {
-self.region_size - (self.state.load(Ordering::Acquire) >> 2)
+self.region_size - self.free_start.get()
}
#[inline]
@ -1303,6 +1227,7 @@ mod host {
mod tests {
use super::*;
use crossbeam_queue::ArrayQueue;
+use parking_lot::Mutex;
use std::thread;
const fn unwrap<T: Copy>(opt: Option<T>) -> T {
@ -1322,7 +1247,7 @@ mod tests {
const REGION_SIZE: DeviceSize =
(ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;
-let allocator = FreeListAllocator::new(0, REGION_SIZE);
+let allocator = Mutex::new(FreeListAllocator::new(0, REGION_SIZE));
let allocs = ArrayQueue::new((ALLOCATIONS_PER_THREAD * THREADS) as usize);
// Using threads to randomize allocation order.
@ -1337,6 +1262,7 @@ mod tests {
allocs
.push(
allocator
+.lock()
.allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
.unwrap(),
)
@ -1346,6 +1272,8 @@ mod tests {
}
});
+let allocator = allocator.into_inner();
assert!(allocator
.allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
.is_err());
@ -1709,39 +1637,4 @@ mod tests {
allocator.reset();
assert!(allocator.free_size() == REGION_SIZE);
}
-#[test]
-fn bump_allocator_syncness() {
-const THREADS: DeviceSize = 12;
-const ALLOCATIONS_PER_THREAD: DeviceSize = 100_000;
-const ALLOCATION_STEP: DeviceSize = 117;
-const REGION_SIZE: DeviceSize =
-(ALLOCATION_STEP * (THREADS + 1) * THREADS / 2) * ALLOCATIONS_PER_THREAD;
-let mut allocator = BumpAllocator::new(0, REGION_SIZE);
-thread::scope(|scope| {
-for i in 1..=THREADS {
-let allocator = &allocator;
-scope.spawn(move || {
-let layout = DeviceLayout::from_size_alignment(i * ALLOCATION_STEP, 1).unwrap();
-for _ in 0..ALLOCATIONS_PER_THREAD {
-allocator
-.allocate(layout, AllocationType::Unknown, DeviceAlignment::MIN)
-.unwrap();
-}
-});
-}
-});
-assert!(allocator
-.allocate(DUMMY_LAYOUT, AllocationType::Unknown, DeviceAlignment::MIN)
-.is_err());
-assert!(allocator.free_size() == 0);
-allocator.reset();
-assert!(allocator.free_size() == REGION_SIZE);
-}
}
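The `bump_allocator_syncness` test above is deleted because a `!Sync` `BumpAllocator` can no longer be hammered from several threads directly; sharing now goes through a lock, as the updated free-list test shows with `Mutex::new(FreeListAllocator::new(..))`. A sketch of what that looks like from the caller's side, with a toy `Bump` type standing in for any of the suballocators:

use std::cell::Cell;
use std::sync::Mutex;
use std::thread;

struct Bump {
    // Interior mutability makes `Bump: !Sync`, but it is still `Send`.
    free_start: Cell<u64>,
    region_size: u64,
}

impl Bump {
    fn allocate(&self, size: u64) -> Result<u64, ()> {
        let offset = self.free_start.get();
        if offset + size > self.region_size {
            return Err(());
        }
        self.free_start.set(offset + size);
        Ok(offset)
    }
}

fn main() {
    // `Mutex<Bump>` is `Sync` because `Bump` is `Send`, so a reference to it can be
    // shared across scoped threads even though `Bump` itself cannot.
    let allocator = Mutex::new(Bump { free_start: Cell::new(0), region_size: 4096 });

    thread::scope(|scope| {
        for _ in 0..4 {
            scope.spawn(|| {
                for _ in 0..8 {
                    allocator.lock().unwrap().allocate(16).unwrap();
                }
            });
        }
    });

    assert_eq!(allocator.into_inner().unwrap().free_start.get(), 4 * 8 * 16);
}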