CpuBufferPool revamp (#2076)

* `CpuBufferPool` revamp

* Fix oopsie

* Fix docs
marc0246 2022-11-05 08:50:46 +01:00 committed by GitHub
parent c5c6bf0f09
commit fe01ddd5e3
8 changed files with 671 additions and 1016 deletions


@@ -7,17 +7,7 @@
// notice may not be copied, modified, or distributed except
// according to those terms.
// BufferPool Example
//
// Modified triangle example to show BufferPool
// Using a pool allows multiple buffers to be "in-flight" simultaneously
// and is suited to highly dynamic, similar sized chunks of data
//
// NOTE:(jdnewman85) ATM (5/4/2020) CpuBufferPool.next() and .chunk() have identical documentation
// I was unable to get next() to work. The compiler complained that the resulting buffer
// didn't implement VertexSource. Similar issues have been reported.
// See: https://github.com/vulkano-rs/vulkano/issues/1221
// Finally, I have not profiled CpuBufferPool against CpuAccessibleBuffer
// Modified triangle example to show `CpuBufferAllocator`.
use bytemuck::{Pod, Zeroable};
use std::{
@@ -25,7 +15,10 @@ use std::{
time::{SystemTime, UNIX_EPOCH},
};
use vulkano::{
buffer::CpuBufferPool,
buffer::{
allocator::{CpuBufferAllocator, CpuBufferAllocatorCreateInfo},
BufferUsage,
},
command_buffer::{
allocator::StandardCommandBufferAllocator, AutoCommandBufferBuilder, CommandBufferUsage,
RenderPassBeginInfo, SubpassContents,
@@ -171,8 +164,16 @@ fn main() {
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));
// Vertex Buffer Pool
let buffer_pool: CpuBufferPool<Vertex> = CpuBufferPool::vertex_buffer(memory_allocator);
// Using a buffer allocator allows multiple buffers to be "in-flight" simultaneously and is
// suited to highly dynamic data like vertex, index and uniform buffers.
let buffer_allocator = CpuBufferAllocator::new(
memory_allocator,
CpuBufferAllocatorCreateInfo {
// We want to use the allocated subbuffers as vertex buffers.
buffer_usage: BufferUsage::VERTEX_BUFFER,
..Default::default()
},
);
mod vs {
vulkano_shaders::shader! {
@@ -335,8 +336,8 @@ fn main() {
];
let num_vertices = data.len() as u32;
// Allocate a new chunk from buffer_pool
let buffer = buffer_pool.from_iter(data.to_vec()).unwrap();
// Allocate a new subbuffer using the buffer allocator.
let buffer = buffer_allocator.from_iter(data.iter().copied()).unwrap();
let mut builder = AutoCommandBufferBuilder::primary(
&command_buffer_allocator,
queue.queue_family_index(),
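
Taken together, the pattern the revamped example uses is: create one `CpuBufferAllocator` up front with the usages you need, then suballocate a fresh subbuffer from it each frame. A condensed sketch of the two changed sites above, assuming the example's `Vertex` type, `memory_allocator` and per-frame `data` array:

let buffer_allocator = CpuBufferAllocator::new(
    memory_allocator.clone(),
    CpuBufferAllocatorCreateInfo {
        // The subbuffers will be bound as vertex buffers.
        buffer_usage: BufferUsage::VERTEX_BUFFER,
        ..Default::default()
    },
);

// Once per frame: allocate a subbuffer holding this frame's vertices. The backing arena
// is recycled automatically once every subbuffer allocated from it has been dropped.
let vertex_buffer = buffer_allocator.from_iter(data.iter().copied()).unwrap();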


@@ -27,7 +27,10 @@
use bytemuck::{Pod, Zeroable};
use std::sync::Arc;
use vulkano::{
buffer::{BufferUsage, CpuBufferPool},
buffer::{
allocator::{CpuBufferAllocator, CpuBufferAllocatorCreateInfo},
BufferUsage,
},
command_buffer::{
allocator::StandardCommandBufferAllocator, AutoCommandBufferBuilder, CommandBufferUsage,
DrawIndirectCommand, RenderPassBeginInfo, SubpassContents,
@@ -42,7 +45,7 @@ use vulkano::{
image::{view::ImageView, ImageAccess, ImageUsage, SwapchainImage},
impl_vertex,
instance::{Instance, InstanceCreateInfo},
memory::allocator::{MemoryUsage, StandardMemoryAllocator},
memory::allocator::StandardMemoryAllocator,
pipeline::{
graphics::{
input_assembly::InputAssemblyState,
@@ -256,17 +259,21 @@ fn main() {
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));
// Each frame we generate a new set of vertices and each frame we need a new DrawIndirectCommand struct to
// set the number of vertices to draw
let indirect_args_pool: CpuBufferPool<DrawIndirectCommand> = CpuBufferPool::new(
// Each frame we generate a new set of vertices and each frame we need a new
// DrawIndirectCommand struct to set the number of vertices to draw.
let indirect_args_pool = CpuBufferAllocator::new(
memory_allocator.clone(),
BufferUsage::INDIRECT_BUFFER | BufferUsage::STORAGE_BUFFER,
MemoryUsage::Upload,
CpuBufferAllocatorCreateInfo {
buffer_usage: BufferUsage::INDIRECT_BUFFER | BufferUsage::STORAGE_BUFFER,
..Default::default()
},
);
let vertex_pool: CpuBufferPool<Vertex> = CpuBufferPool::new(
let vertex_pool = CpuBufferAllocator::new(
memory_allocator,
BufferUsage::STORAGE_BUFFER | BufferUsage::VERTEX_BUFFER,
MemoryUsage::Upload,
CpuBufferAllocatorCreateInfo {
buffer_usage: BufferUsage::STORAGE_BUFFER | BufferUsage::VERTEX_BUFFER,
..Default::default()
},
);
let compute_pipeline = ComputePipeline::new(


@@ -11,7 +11,10 @@ use cgmath::{Matrix3, Matrix4, Point3, Rad, Vector3};
use examples::{Normal, Vertex, INDICES, NORMALS, VERTICES};
use std::{sync::Arc, time::Instant};
use vulkano::{
buffer::{BufferUsage, CpuAccessibleBuffer, CpuBufferPool, TypedBufferAccess},
buffer::{
allocator::{CpuBufferAllocator, CpuBufferAllocatorCreateInfo},
BufferUsage, CpuAccessibleBuffer, TypedBufferAccess,
},
command_buffer::{
allocator::StandardCommandBufferAllocator, AutoCommandBufferBuilder, CommandBufferUsage,
RenderPassBeginInfo, SubpassContents,
@@ -26,7 +29,7 @@ use vulkano::{
format::Format,
image::{view::ImageView, AttachmentImage, ImageAccess, ImageUsage, SwapchainImage},
instance::{Instance, InstanceCreateInfo},
memory::allocator::{MemoryUsage, StandardMemoryAllocator},
memory::allocator::StandardMemoryAllocator,
pipeline::{
graphics::{
depth_stencil::DepthStencilState,
@@ -180,10 +183,12 @@ fn main() {
)
.unwrap();
let uniform_buffer = CpuBufferPool::<vs::ty::Data>::new(
let uniform_buffer = CpuBufferAllocator::new(
memory_allocator.clone(),
BufferUsage::UNIFORM_BUFFER,
MemoryUsage::Upload,
CpuBufferAllocatorCreateInfo {
buffer_usage: BufferUsage::UNIFORM_BUFFER,
..Default::default()
},
);
let vs = vs::load(device.clone()).unwrap();


@@ -0,0 +1,566 @@
// Copyright (c) 2017 The vulkano developers
// Licensed under the Apache License, Version 2.0
// <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT
// license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
// at your option. All files in the project carrying such
// notice may not be copied, modified, or distributed except
// according to those terms.
//! Efficiently suballocates buffers into smaller subbuffers.
use super::{
sys::{Buffer, BufferCreateInfo, RawBuffer},
BufferAccess, BufferAccessObject, BufferContents, BufferError, BufferInner, BufferUsage,
TypedBufferAccess,
};
use crate::{
buffer::sys::BufferMemory,
device::{Device, DeviceOwned},
memory::{
allocator::{
align_up, AllocationCreateInfo, AllocationCreationError, AllocationType,
MemoryAllocatePreference, MemoryAllocator, MemoryUsage, StandardMemoryAllocator,
},
DedicatedAllocation,
},
DeviceSize,
};
use crossbeam_queue::ArrayQueue;
use std::{
cell::UnsafeCell,
marker::PhantomData,
mem::{align_of, size_of, ManuallyDrop},
num::NonZeroU64,
ptr,
sync::Arc,
};
const MAX_ARENAS: usize = 32;
// TODO: Add `CpuSubbuffer::read` to read the content of a subbuffer.
// But that's hard to do because we must prevent `increase_gpu_lock` from working while a
// buffer is locked.
/// Efficiently suballocates buffers into smaller subbuffers.
///
/// This allocator is especially suitable when you want to upload or download some data regularly
/// (for example, at each frame for a video game).
///
/// # Algorithm
///
/// The allocator keeps a pool of *arenas*. An arena is simply a buffer in which *arena allocation*
/// takes place, also known as *bump allocation* or *linear allocation*. Every time you allocate,
/// one of these arenas is suballocated. If there is no arena that is currently available, one will
/// be allocated. After all subbuffers allocated from an arena are dropped, the arena is
/// automatically returned to the arena pool. If you try to allocate a subbuffer larger than the
/// current size of an arena, the arenas are automatically resized.
///
/// No memory is allocated when the allocator is created, be it on the Vulkan or Rust side. That
/// only happens once you allocate a subbuffer.
///
/// # Usage
///
/// Ideally, one arena should be able to fit all data you need to update per frame, so that each
/// arena is submitted and freed once per frame. This way, the arena pool would also contain as
/// many arenas as there are frames in flight on the thread. Otherwise, if your arenas are not able
/// to fit everything each frame, what will likely happen is that each subbuffer will be
/// allocated from an individual arena. This can impact efficiency both in terms of memory usage
/// (because each arena has the same size, even if some of the subbuffers are way smaller) as well
/// as performance, because the data could end up more physically separated in memory, which means
/// the GPU would need to hop from place to place a lot more during a frame.
///
/// Ideally the result is something roughly like this:
///
/// ```plain
/// +---------------------------------------------------------------------------------------------+
/// | Memory Block |
/// |-----+------+-----------------------+---------+-----------------------+------+---------+-----|
/// | | | Frame 1 Arena | | Frame 2 Arena | | | |
/// | ••• | Tex. |-------+-------+-------| Attach. |-------+-------+-------| Tex. | Attach. | ••• |
/// | | | Vert. | Indx. | Unif. | | Vert. | Indx. | Unif. | | | |
/// +-----+------+-------+-------+-------+---------+-------+-------+-------+------+---------+-----+
/// ```
///
/// # Examples
///
/// ```
/// use vulkano::buffer::allocator::CpuBufferAllocator;
/// use vulkano::command_buffer::{
/// AutoCommandBufferBuilder, CommandBufferUsage, PrimaryCommandBufferAbstract,
/// };
/// use vulkano::sync::GpuFuture;
/// # let queue: std::sync::Arc<vulkano::device::Queue> = return;
/// # let memory_allocator: std::sync::Arc<vulkano::memory::allocator::StandardMemoryAllocator> = return;
/// # let command_buffer_allocator: vulkano::command_buffer::allocator::StandardCommandBufferAllocator = return;
///
/// // Create the buffer allocator.
/// let buffer_allocator = CpuBufferAllocator::new(memory_allocator.clone(), Default::default());
///
/// for n in 0..25u32 {
/// // Each loop allocates a new subbuffer and stores `data` in it.
/// let data: [f32; 4] = [1.0, 0.5, n as f32 / 24.0, 0.0];
/// let subbuffer = buffer_allocator.from_data(data).unwrap();
///
/// // You can then use `subbuffer` as if it was an entirely separate buffer.
/// AutoCommandBufferBuilder::primary(
/// &command_buffer_allocator,
/// queue.queue_family_index(),
/// CommandBufferUsage::OneTimeSubmit,
/// )
/// .unwrap()
/// // For the sake of the example we just call `update_buffer` on the buffer, even though
/// // it is pointless to do that.
/// .update_buffer(&[0.2, 0.3, 0.4, 0.5], subbuffer.clone(), 0)
/// .unwrap()
/// .build().unwrap()
/// .execute(queue.clone())
/// .unwrap()
/// .then_signal_fence_and_flush()
/// .unwrap();
/// }
/// ```
#[derive(Debug)]
pub struct CpuBufferAllocator<A = Arc<StandardMemoryAllocator>> {
state: UnsafeCell<CpuBufferAllocatorState<A>>,
}
impl<A> CpuBufferAllocator<A>
where
A: MemoryAllocator,
{
/// Creates a new `CpuBufferAllocator`.
///
/// # Panics
///
/// - Panics if `create_info.memory_usage` is [`MemoryUsage::GpuOnly`].
pub fn new(memory_allocator: A, create_info: CpuBufferAllocatorCreateInfo) -> Self {
let CpuBufferAllocatorCreateInfo {
arena_size,
buffer_usage,
memory_usage,
_ne: _,
} = create_info;
assert!(memory_usage != MemoryUsage::GpuOnly);
let properties = memory_allocator.device().physical_device().properties();
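        // Subbuffers may be bound as uniform or storage buffers, which have device-specific
        // minimum offset alignments, so use the strictest alignment among the requested
        // usages (falling back to 1 if neither usage is requested).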
let buffer_alignment = [
buffer_usage
.contains(BufferUsage::UNIFORM_BUFFER)
.then_some(properties.min_uniform_buffer_offset_alignment),
buffer_usage
.contains(BufferUsage::STORAGE_BUFFER)
.then_some(properties.min_storage_buffer_offset_alignment),
]
.into_iter()
.flatten()
.max()
.unwrap_or(1);
CpuBufferAllocator {
state: UnsafeCell::new(CpuBufferAllocatorState {
memory_allocator,
buffer_usage,
memory_usage,
buffer_alignment,
arena_size,
arena: None,
free_start: 0,
reserve: None,
}),
}
}
/// Returns the current size of the arenas.
pub fn arena_size(&self) -> DeviceSize {
unsafe { &*self.state.get() }.arena_size
}
/// Sets the arena size to the provided `size`.
///
/// The next time you allocate a subbuffer, a new arena will be allocated with the new size,
/// and all subsequently allocated arenas will also share the new size.
pub fn set_arena_size(&self, size: DeviceSize) {
let state = unsafe { &mut *self.state.get() };
state.arena_size = size;
state.arena = None;
state.reserve = None;
}
/// Ensures that the size of the current arena is at least `size`.
///
/// If `size` is greater than the current arena size, then a new arena will be allocated with
/// the new size, and all subsequently allocated arenas will also share the new size. Otherwise
/// this has no effect.
pub fn reserve(&self, size: DeviceSize) -> Result<(), AllocationCreationError> {
if size > self.arena_size() {
let state = unsafe { &mut *self.state.get() };
state.arena_size = size;
state.reserve = None;
state.arena = Some(state.next_arena()?);
}
Ok(())
}
/// Allocates a subbuffer and writes `data` in it.
///
/// # Panics
///
/// - Panics if `T` has zero size.
/// - Panics if `T` has an alignment greater than `64`.
pub fn from_data<T>(&self, data: T) -> Result<Arc<CpuSubbuffer<T>>, AllocationCreationError>
where
T: BufferContents,
{
assert!(size_of::<T>() > 0);
assert!(align_of::<T>() <= 64);
let state = unsafe { &mut *self.state.get() };
let size = size_of::<T>() as DeviceSize;
let offset = state.allocate(size, align_of::<T>() as DeviceSize)?;
let arena = state.arena.as_ref().unwrap().clone();
let allocation = match arena.inner.memory() {
BufferMemory::Normal(a) => a,
BufferMemory::Sparse => unreachable!(),
};
unsafe {
let bytes = allocation.write(offset..offset + size).unwrap();
let mapping = T::from_bytes_mut(bytes).unwrap();
ptr::write(mapping, data);
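            // The allocation reports an atom size only when the mapped memory is not
            // host-coherent; in that case the written range must be flushed, rounded up
            // to the atom size, for the device to see the writes.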
if let Some(atom_size) = allocation.atom_size() {
let size = align_up(size, atom_size.get());
let end = DeviceSize::min(offset + size, allocation.size());
allocation.flush_range(offset..end).unwrap();
}
}
Ok(Arc::new(CpuSubbuffer {
id: CpuSubbuffer::<T>::next_id(),
offset,
size,
arena,
_marker: PhantomData,
}))
}
/// Allocates a subbuffer and writes all elements of `iter` in it.
///
/// # Panics
///
/// - Panics if `T` has zero size.
/// - Panics if `T` has an alignment greater than `64`.
pub fn from_iter<T, I>(
&self,
iter: I,
) -> Result<Arc<CpuSubbuffer<[T]>>, AllocationCreationError>
where
[T]: BufferContents,
I: IntoIterator<Item = T>,
I::IntoIter: ExactSizeIterator,
{
assert!(size_of::<T>() > 0);
assert!(align_of::<T>() <= 64);
let iter = iter.into_iter();
let state = unsafe { &mut *self.state.get() };
let size = (size_of::<T>() * iter.len()) as DeviceSize;
let offset = state.allocate(size, align_of::<T>() as DeviceSize)?;
let arena = state.arena.as_ref().unwrap().clone();
let allocation = match arena.inner.memory() {
BufferMemory::Normal(a) => a,
BufferMemory::Sparse => unreachable!(),
};
unsafe {
let bytes = allocation.write(offset..offset + size).unwrap();
let mapping = <[T]>::from_bytes_mut(bytes).unwrap();
for (o, i) in mapping.iter_mut().zip(iter) {
ptr::write(o, i);
}
if let Some(atom_size) = allocation.atom_size() {
let size = align_up(size, atom_size.get());
let end = DeviceSize::min(offset + size, allocation.size());
allocation.flush_range(offset..end).unwrap();
}
}
Ok(Arc::new(CpuSubbuffer {
id: CpuSubbuffer::<T>::next_id(),
offset,
size,
arena,
_marker: PhantomData,
}))
}
}
#[derive(Debug)]
struct CpuBufferAllocatorState<A> {
memory_allocator: A,
buffer_usage: BufferUsage,
memory_usage: MemoryUsage,
// The alignment required for the subbuffers.
buffer_alignment: DeviceSize,
// The current size of the arenas.
arena_size: DeviceSize,
// Contains the buffer that is currently being suballocated.
arena: Option<Arc<Arena>>,
// Offset pointing to the start of free memory within the arena.
free_start: DeviceSize,
// When an `Arena` is dropped, it returns itself here for reuse.
reserve: Option<Arc<ArrayQueue<Arc<Buffer>>>>,
}
impl<A> CpuBufferAllocatorState<A>
where
A: MemoryAllocator,
{
fn allocate(
&mut self,
size: DeviceSize,
alignment: DeviceSize,
) -> Result<DeviceSize, AllocationCreationError> {
let alignment = DeviceSize::max(alignment, self.buffer_alignment);
loop {
if self.arena.is_none() {
// If the requested size is larger than the arenas, we need to resize them.
if self.arena_size < size {
self.arena_size = size * 2;
// We need to drop our reference to the old pool to make sure the arenas are
// dropped once no longer in use, and replace it with a new pool that will not
// be polluted with the outdated arenas.
self.reserve = None;
}
self.arena = Some(self.next_arena()?);
self.free_start = 0;
}
let arena = self.arena.as_ref().unwrap();
let allocation = match arena.inner.memory() {
BufferMemory::Normal(a) => a,
BufferMemory::Sparse => unreachable!(),
};
let arena_offset = allocation.offset();
let atom_size = allocation.atom_size().map(NonZeroU64::get).unwrap_or(1);
let alignment = DeviceSize::max(alignment, atom_size);
let offset = align_up(arena_offset + self.free_start, alignment);
if offset + size <= arena_offset + self.arena_size {
let offset = offset - arena_offset;
self.free_start = offset + size;
return Ok(offset);
}
// We reached the end of the arena, grab the next one.
self.arena = None;
}
}
fn next_arena(&mut self) -> Result<Arc<Arena>, AllocationCreationError> {
if self.reserve.is_none() {
self.reserve = Some(Arc::new(ArrayQueue::new(MAX_ARENAS)));
}
let reserve = self.reserve.as_ref().unwrap();
reserve
.pop()
.map(Ok)
.unwrap_or_else(|| self.create_arena())
.map(|inner| {
Arc::new(Arena {
inner: ManuallyDrop::new(inner),
reserve: reserve.clone(),
})
})
}
fn create_arena(&self) -> Result<Arc<Buffer>, AllocationCreationError> {
let raw_buffer = RawBuffer::new(
self.memory_allocator.device().clone(),
BufferCreateInfo {
size: self.arena_size,
usage: self.buffer_usage,
..Default::default()
},
)
.map_err(|err| match err {
BufferError::AllocError(err) => err,
// We don't use sparse-binding, therefore the other errors can't happen.
_ => unreachable!(),
})?;
let mut requirements = *raw_buffer.memory_requirements();
requirements.alignment = DeviceSize::max(requirements.alignment, self.buffer_alignment);
let create_info = AllocationCreateInfo {
requirements,
allocation_type: AllocationType::Linear,
usage: self.memory_usage,
allocate_preference: MemoryAllocatePreference::Unknown,
dedicated_allocation: Some(DedicatedAllocation::Buffer(&raw_buffer)),
..Default::default()
};
match unsafe { self.memory_allocator.allocate_unchecked(create_info) } {
Ok(mut alloc) => {
debug_assert!(alloc.offset() % requirements.alignment == 0);
debug_assert!(alloc.size() == requirements.size);
alloc.shrink(self.arena_size);
let inner = Arc::new(
unsafe { raw_buffer.bind_memory_unchecked(alloc) }
.map_err(|(err, _, _)| err)?,
);
Ok(inner)
}
Err(err) => Err(err),
}
}
}
#[derive(Debug)]
struct Arena {
inner: ManuallyDrop<Arc<Buffer>>,
// Where we return the arena in our `Drop` impl.
reserve: Arc<ArrayQueue<Arc<Buffer>>>,
}
impl Drop for Arena {
fn drop(&mut self) {
let inner = unsafe { ManuallyDrop::take(&mut self.inner) };
let _ = self.reserve.push(inner);
}
}
/// Parameters to create a new [`CpuBufferAllocator`].
pub struct CpuBufferAllocatorCreateInfo {
/// Initial size of an arena in bytes.
///
/// Ideally this should fit all the data you need to update per frame. So for example, if you
/// need to allocate buffers of size 1K, 2K and 5K each frame, then this should be 8K. If your
/// data is dynamically-sized then try to make an educated guess or simply leave the default.
///
/// The default value is `0`.
pub arena_size: DeviceSize,
/// The buffer usage that all allocated buffers should have.
///
/// The default value is [`BufferUsage::TRANSFER_SRC`].
pub buffer_usage: BufferUsage,
/// The memory usage that all buffers should be allocated with.
///
/// Must not be [`MemoryUsage::GpuOnly`].
///
/// The default value is [`MemoryUsage::Upload`].
pub memory_usage: MemoryUsage,
pub _ne: crate::NonExhaustive,
}
impl Default for CpuBufferAllocatorCreateInfo {
#[inline]
fn default() -> Self {
CpuBufferAllocatorCreateInfo {
arena_size: 0,
buffer_usage: BufferUsage::TRANSFER_SRC,
memory_usage: MemoryUsage::Upload,
_ne: crate::NonExhaustive(()),
}
}
}
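// A minimal sketch applying the `arena_size` guidance above (the 1K + 2K + 5K per-frame
// example, rounded up to 8 KiB), assuming an `Arc<StandardMemoryAllocator>` named
// `memory_allocator` and a uniform-buffer workload:
//
//     let buffer_allocator = CpuBufferAllocator::new(
//         memory_allocator.clone(),
//         CpuBufferAllocatorCreateInfo {
//             // Enough room for all uniform data written during one frame.
//             arena_size: 8 * 1024,
//             buffer_usage: BufferUsage::UNIFORM_BUFFER,
//             ..Default::default()
//         },
//     );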
/// A subbuffer allocated using a [`CpuBufferAllocator`].
#[derive(Debug)]
pub struct CpuSubbuffer<T: ?Sized> {
id: NonZeroU64,
// Offset within the arena.
offset: DeviceSize,
// Size of the subbuffer.
size: DeviceSize,
// We need to keep a reference to the arena so it won't be reset.
arena: Arc<Arena>,
_marker: PhantomData<Box<T>>,
}
unsafe impl<T> BufferAccess for CpuSubbuffer<T>
where
T: BufferContents + ?Sized,
{
fn inner(&self) -> BufferInner<'_> {
BufferInner {
buffer: &self.arena.inner,
offset: self.offset,
}
}
fn size(&self) -> DeviceSize {
self.size
}
}
impl<T> BufferAccessObject for Arc<CpuSubbuffer<T>>
where
T: BufferContents + ?Sized,
{
fn as_buffer_access_object(&self) -> Arc<dyn BufferAccess> {
self.clone()
}
}
unsafe impl<T> TypedBufferAccess for CpuSubbuffer<T>
where
T: BufferContents + ?Sized,
{
type Content = T;
}
unsafe impl<T> DeviceOwned for CpuSubbuffer<T>
where
T: ?Sized,
{
fn device(&self) -> &Arc<Device> {
self.arena.inner.device()
}
}
crate::impl_id_counter!(CpuSubbuffer<T>);
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn reserve() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = StandardMemoryAllocator::new_default(device);
let buffer_allocator = CpuBufferAllocator::new(memory_allocator, Default::default());
assert_eq!(buffer_allocator.arena_size(), 0);
buffer_allocator.reserve(83).unwrap();
assert_eq!(buffer_allocator.arena_size(), 83);
}
#[test]
fn capacity_increase() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = StandardMemoryAllocator::new_default(device);
let buffer_allocator = CpuBufferAllocator::new(memory_allocator, Default::default());
assert_eq!(buffer_allocator.arena_size(), 0);
buffer_allocator.from_data(12u32).unwrap();
assert_eq!(buffer_allocator.arena_size(), 8);
}
}


@@ -1,931 +0,0 @@
// Copyright (c) 2017 The vulkano developers
// Licensed under the Apache License, Version 2.0
// <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT
// license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
// at your option. All files in the project carrying such
// notice may not be copied, modified, or distributed except
// according to those terms.
use super::{
sys::{Buffer, BufferCreateInfo, RawBuffer},
BufferAccess, BufferAccessObject, BufferContents, BufferError, BufferInner, BufferUsage,
TypedBufferAccess,
};
use crate::{
buffer::sys::BufferMemory,
device::{Device, DeviceOwned},
memory::{
allocator::{
AllocationCreateInfo, AllocationCreationError, AllocationType,
MemoryAllocatePreference, MemoryAllocator, MemoryUsage, StandardMemoryAllocator,
},
DedicatedAllocation,
},
DeviceSize, VulkanError,
};
use std::{
hash::{Hash, Hasher},
marker::PhantomData,
mem::size_of,
ptr,
sync::{
atomic::{AtomicU64, Ordering},
Arc, Mutex, MutexGuard,
},
};
// TODO: Add `CpuBufferPoolSubbuffer::read` to read the content of a subbuffer.
// But that's hard to do because we must prevent `increase_gpu_lock` from working while a
// a buffer is locked.
/// Ring buffer from which "sub-buffers" can be individually allocated.
///
/// This buffer is especially suitable when you want to upload or download some data regularly
/// (for example, at each frame for a video game).
///
/// # Usage
///
/// A `CpuBufferPool` is similar to a ring buffer. You start by creating an empty pool, then you
/// grab elements from the pool and use them, and if the pool is full it will automatically grow
/// in size.
///
/// Contrary to a `Vec`, elements automatically free themselves when they are dropped (ie. usually
/// when you call `cleanup_finished()` on a future, or when you drop that future).
///
/// # Arc-like
///
/// The `CpuBufferPool` struct internally contains an `Arc`. You can clone the `CpuBufferPool` for
/// a cheap cost, and all the clones will share the same underlying buffer.
///
/// # Example
///
/// ```
/// use vulkano::buffer::CpuBufferPool;
/// use vulkano::command_buffer::AutoCommandBufferBuilder;
/// use vulkano::command_buffer::CommandBufferUsage;
/// use vulkano::command_buffer::PrimaryCommandBufferAbstract;
/// use vulkano::sync::GpuFuture;
/// # let queue: std::sync::Arc<vulkano::device::Queue> = return;
/// # let memory_allocator: std::sync::Arc<vulkano::memory::allocator::StandardMemoryAllocator> = return;
/// # let command_buffer_allocator: vulkano::command_buffer::allocator::StandardCommandBufferAllocator = return;
///
/// // Create the ring buffer.
/// let buffer = CpuBufferPool::upload(memory_allocator);
///
/// for n in 0 .. 25u32 {
/// // Each loop grabs a new entry from that ring buffer and stores ` data` in it.
/// let data: [f32; 4] = [1.0, 0.5, n as f32 / 24.0, 0.0];
/// let sub_buffer = buffer.from_data(data).unwrap();
///
/// // You can then use `sub_buffer` as if it was an entirely separate buffer.
/// AutoCommandBufferBuilder::primary(
/// &command_buffer_allocator,
/// queue.queue_family_index(),
/// CommandBufferUsage::OneTimeSubmit,
/// )
/// .unwrap()
/// // For the sake of the example we just call `update_buffer` on the buffer, even though
/// // it is pointless to do that.
/// .update_buffer(&[0.2, 0.3, 0.4, 0.5], sub_buffer.clone(), 0)
/// .unwrap()
/// .build().unwrap()
/// .execute(queue.clone())
/// .unwrap()
/// .then_signal_fence_and_flush()
/// .unwrap();
/// }
/// ```
pub struct CpuBufferPool<T, A = StandardMemoryAllocator>
where
[T]: BufferContents,
A: MemoryAllocator + ?Sized,
{
// The memory pool to use for allocations.
allocator: Arc<A>,
// Current buffer from which elements are grabbed.
current_buffer: Mutex<Option<Arc<ActualBuffer>>>,
// Buffer usage.
buffer_usage: BufferUsage,
memory_usage: MemoryUsage,
// Necessary to make it compile.
marker: PhantomData<Box<T>>,
}
// One buffer of the pool.
#[derive(Debug)]
struct ActualBuffer {
inner: Arc<Buffer>,
// List of the chunks that are reserved.
chunks_in_use: Mutex<Vec<ActualBufferChunk>>,
// The index of the chunk that should be available next for the ring buffer.
next_index: AtomicU64,
// Number of elements in the buffer.
capacity: DeviceSize,
}
// Access pattern of one subbuffer.
#[derive(Debug)]
struct ActualBufferChunk {
// First element number within the actual buffer.
index: DeviceSize,
// Number of occupied elements within the actual buffer.
len: DeviceSize,
// Number of `CpuBufferPoolSubbuffer` objects that point to this subbuffer.
num_cpu_accesses: usize,
}
/// A subbuffer allocated from a `CpuBufferPool`.
///
/// When this object is destroyed, the subbuffer is automatically reclaimed by the pool.
pub struct CpuBufferPoolChunk<T>
where
[T]: BufferContents,
{
buffer: Arc<ActualBuffer>,
// Index of the subbuffer within `buffer`. In number of elements.
index: DeviceSize,
// Number of bytes to add to `index * mem::size_of::<T>()` to obtain the start of the data in
// the buffer. Necessary for alignment purposes.
align_offset: DeviceSize,
// Size of the subbuffer in number of elements, as requested by the user.
// If this is 0, then no entry was added to `chunks_in_use`.
requested_len: DeviceSize,
// Necessary to make it compile.
marker: PhantomData<Box<T>>,
}
/// A subbuffer allocated from a `CpuBufferPool`.
///
/// When this object is destroyed, the subbuffer is automatically reclaimed by the pool.
pub struct CpuBufferPoolSubbuffer<T>
where
[T]: BufferContents,
{
// This struct is just a wrapper around `CpuBufferPoolChunk`.
chunk: CpuBufferPoolChunk<T>,
}
impl<T, A> CpuBufferPool<T, A>
where
[T]: BufferContents,
A: MemoryAllocator + ?Sized,
{
/// Builds a `CpuBufferPool`.
///
/// # Panics
///
/// - Panics if `T` has zero size.
/// - Panics if `memory_usage` is [`MemoryUsage::GpuOnly`].
pub fn new(
allocator: Arc<A>,
buffer_usage: BufferUsage,
memory_usage: MemoryUsage,
) -> CpuBufferPool<T, A> {
assert!(size_of::<T>() > 0);
assert!(memory_usage != MemoryUsage::GpuOnly);
CpuBufferPool {
allocator,
current_buffer: Mutex::new(None),
buffer_usage,
memory_usage,
marker: PhantomData,
}
}
/// Builds a `CpuBufferPool` meant for simple uploads.
///
/// Shortcut for a pool that can only be used as transfer source and with exclusive queue
/// family accesses.
///
/// # Panics
///
/// - Panics if `T` has zero size.
pub fn upload(allocator: Arc<A>) -> CpuBufferPool<T, A> {
CpuBufferPool::new(allocator, BufferUsage::TRANSFER_SRC, MemoryUsage::Upload)
}
/// Builds a `CpuBufferPool` meant for simple downloads.
///
/// Shortcut for a pool that can only be used as transfer destination and with exclusive queue
/// family accesses.
///
/// # Panics
///
/// - Panics if `T` has zero size.
pub fn download(allocator: Arc<A>) -> CpuBufferPool<T, A> {
CpuBufferPool::new(allocator, BufferUsage::TRANSFER_DST, MemoryUsage::Download)
}
/// Builds a `CpuBufferPool` meant for usage as a uniform buffer.
///
/// Shortcut for a pool that can only be used as uniform buffer and with exclusive queue
/// family accesses.
///
/// # Panics
///
/// - Panics if `T` has zero size.
pub fn uniform_buffer(allocator: Arc<A>) -> CpuBufferPool<T, A> {
CpuBufferPool::new(allocator, BufferUsage::UNIFORM_BUFFER, MemoryUsage::Upload)
}
/// Builds a `CpuBufferPool` meant for usage as a vertex buffer.
///
/// Shortcut for a pool that can only be used as vertex buffer and with exclusive queue
/// family accesses.
///
/// # Panics
///
/// - Panics if `T` has zero size.
pub fn vertex_buffer(allocator: Arc<A>) -> CpuBufferPool<T, A> {
CpuBufferPool::new(allocator, BufferUsage::VERTEX_BUFFER, MemoryUsage::Upload)
}
/// Builds a `CpuBufferPool` meant for usage as a indirect buffer.
///
/// Shortcut for a pool that can only be used as indirect buffer and with exclusive queue
/// family accesses.
///
/// # Panics
///
/// - Panics if `T` has zero size.
pub fn indirect_buffer(allocator: Arc<A>) -> CpuBufferPool<T, A> {
CpuBufferPool::new(allocator, BufferUsage::INDIRECT_BUFFER, MemoryUsage::Upload)
}
}
impl<T, A> CpuBufferPool<T, A>
where
[T]: BufferContents,
A: MemoryAllocator + ?Sized,
{
/// Returns the current capacity of the pool, in number of elements.
pub fn capacity(&self) -> DeviceSize {
match *self.current_buffer.lock().unwrap() {
None => 0,
Some(ref buf) => buf.capacity,
}
}
/// Makes sure that the capacity is at least `capacity`. Allocates memory if it is not the
/// case.
///
/// Since this can involve a memory allocation, an `OomError` can happen.
pub fn reserve(&self, capacity: DeviceSize) -> Result<(), AllocationCreationError> {
if capacity == 0 {
return Ok(());
}
let mut cur_buf = self.current_buffer.lock().unwrap();
// Check current capacity.
match *cur_buf {
Some(ref buf) if buf.capacity >= capacity => {
return Ok(());
}
_ => (),
};
self.reset_buf(&mut cur_buf, capacity)
}
/// Grants access to a new subbuffer and puts `data` in it.
///
/// If no subbuffer is available (because they are still in use by the GPU), a new buffer will
/// automatically be allocated.
///
/// > **Note**: You can think of it like a `Vec`. If you insert an element and the `Vec` is not
/// > large enough, a new chunk of memory is automatically allocated.
pub fn from_data(
&self,
data: T,
) -> Result<Arc<CpuBufferPoolSubbuffer<T>>, AllocationCreationError> {
Ok(Arc::new(CpuBufferPoolSubbuffer {
chunk: self.chunk_impl([data].into_iter())?,
}))
}
/// Grants access to a new subbuffer and puts all elements of `iter` in it.
///
/// If no subbuffer is available (because they are still in use by the GPU), a new buffer will
/// automatically be allocated.
///
/// > **Note**: You can think of it like a `Vec`. If you insert elements and the `Vec` is not
/// > large enough, a new chunk of memory is automatically allocated.
///
/// # Panic
///
/// Panics if the length of the iterator didn't match the actual number of elements.
pub fn from_iter<I>(
&self,
iter: I,
) -> Result<Arc<CpuBufferPoolChunk<T>>, AllocationCreationError>
where
I: IntoIterator<Item = T>,
I::IntoIter: ExactSizeIterator,
{
self.chunk_impl(iter.into_iter()).map(Arc::new)
}
fn chunk_impl(
&self,
data: impl ExactSizeIterator<Item = T>,
) -> Result<CpuBufferPoolChunk<T>, AllocationCreationError> {
let mut mutex = self.current_buffer.lock().unwrap();
let data = match self.try_next_impl(&mut mutex, data) {
Ok(n) => return Ok(n),
Err(d) => d,
};
let next_capacity = match *mutex {
Some(ref b) if (data.len() as DeviceSize) < b.capacity => 2 * b.capacity,
_ => 2 * data.len().max(1) as DeviceSize,
};
self.reset_buf(&mut mutex, next_capacity)?;
match self.try_next_impl(&mut mutex, data) {
Ok(n) => Ok(n),
Err(_) => unreachable!(),
}
}
/// Grants access to a new subbuffer and puts `data` in it.
///
/// Returns `None` if no subbuffer is available.
///
/// A `CpuBufferPool` is always empty the first time you use it, so you shouldn't use
/// `try_next` the first time you use it.
pub fn try_next(&self, data: T) -> Option<Arc<CpuBufferPoolSubbuffer<T>>> {
let mut mutex = self.current_buffer.lock().unwrap();
self.try_next_impl(&mut mutex, [data])
.map(|c| Arc::new(CpuBufferPoolSubbuffer { chunk: c }))
.ok()
}
// Creates a new buffer and sets it as current. The capacity is in number of elements.
//
// `cur_buf_mutex` must be an active lock of `self.current_buffer`.
fn reset_buf(
&self,
cur_buf_mutex: &mut MutexGuard<'_, Option<Arc<ActualBuffer>>>,
capacity: DeviceSize,
) -> Result<(), AllocationCreationError> {
let size = match (size_of::<T>() as DeviceSize).checked_mul(capacity) {
Some(s) => s,
None => {
return Err(AllocationCreationError::VulkanError(
VulkanError::OutOfDeviceMemory,
))
}
};
let raw_buffer = RawBuffer::new(
self.device().clone(),
BufferCreateInfo {
size,
usage: self.buffer_usage,
..Default::default()
},
)
.map_err(|err| match err {
BufferError::AllocError(err) => err,
// We don't use sparse-binding, therefore the other errors can't happen.
_ => unreachable!(),
})?;
let requirements = *raw_buffer.memory_requirements();
let create_info = AllocationCreateInfo {
requirements,
allocation_type: AllocationType::Linear,
usage: self.memory_usage,
allocate_preference: MemoryAllocatePreference::Unknown,
dedicated_allocation: Some(DedicatedAllocation::Buffer(&raw_buffer)),
..Default::default()
};
match unsafe { self.allocator.allocate_unchecked(create_info) } {
Ok(mut alloc) => {
debug_assert!(alloc.offset() % requirements.alignment == 0);
debug_assert!(alloc.size() == requirements.size);
alloc.shrink(size);
let inner = unsafe {
Arc::new(
raw_buffer
.bind_memory_unchecked(alloc)
.map_err(|(err, _, _)| err)?,
)
};
**cur_buf_mutex = Some(Arc::new(ActualBuffer {
inner,
chunks_in_use: Mutex::new(vec![]),
next_index: AtomicU64::new(0),
capacity,
}));
Ok(())
}
Err(err) => Err(err),
}
}
// Tries to lock a subbuffer from the current buffer.
//
// `cur_buf_mutex` must be an active lock of `self.current_buffer`.
//
// Returns `data` wrapped inside an `Err` if there is no slot available in the current buffer.
//
// # Panic
//
// Panics if the length of the iterator didn't match the actual number of element.
fn try_next_impl<I>(
&self,
cur_buf_mutex: &mut MutexGuard<'_, Option<Arc<ActualBuffer>>>,
data: I,
) -> Result<CpuBufferPoolChunk<T>, I::IntoIter>
where
I: IntoIterator<Item = T>,
I::IntoIter: ExactSizeIterator,
{
let mut data = data.into_iter();
// Grab the current buffer. Return `Err` if the pool wasn't "initialized" yet.
let current_buffer = match cur_buf_mutex.clone() {
Some(b) => b,
None => return Err(data),
};
let mut chunks_in_use = current_buffer.chunks_in_use.lock().unwrap();
debug_assert!(!chunks_in_use.iter().any(|c| c.len == 0));
// Number of elements requested by the user.
let requested_len = data.len() as DeviceSize;
// We special case when 0 elements are requested. Polluting the list of allocated chunks
// with chunks of length 0 means that we will have troubles deallocating.
if requested_len == 0 {
assert!(
data.next().is_none(),
"Expected iterator passed to CpuBufferPool::chunk to be empty"
);
return Ok(CpuBufferPoolChunk {
// TODO: remove .clone() once non-lexical borrows land
buffer: current_buffer.clone(),
index: 0,
align_offset: 0,
requested_len: 0,
marker: PhantomData,
});
}
// Find a suitable offset and len, or returns if none available.
let (index, occupied_len, align_offset) = {
let (tentative_index, tentative_len, tentative_align_offset) = {
// Since the only place that touches `next_index` is this code, and since we
// own a mutex lock to the buffer, it means that `next_index` can't be accessed
// concurrently.
// TODO: ^ eventually should be put inside the mutex
let idx = current_buffer.next_index.load(Ordering::SeqCst);
// Find the required alignment in bytes.
let align_uniform = if self.buffer_usage.intersects(BufferUsage::UNIFORM_BUFFER) {
self.device()
.physical_device()
.properties()
.min_uniform_buffer_offset_alignment
} else {
1
};
let align_storage = if self.buffer_usage.intersects(BufferUsage::STORAGE_BUFFER) {
self.device()
.physical_device()
.properties()
.min_storage_buffer_offset_alignment
} else {
1
};
let align_bytes = align_uniform.max(align_storage);
let tentative_align_offset = (align_bytes
- ((idx * size_of::<T>() as DeviceSize) % align_bytes))
% align_bytes;
let additional_len = if tentative_align_offset == 0 {
0
} else {
1 + (tentative_align_offset - 1) / size_of::<T>() as DeviceSize
};
(idx, requested_len + additional_len, tentative_align_offset)
};
// Find out whether any chunk in use overlaps this range.
if tentative_index + tentative_len <= current_buffer.capacity
&& !chunks_in_use.iter().any(|c| {
(c.index >= tentative_index && c.index < tentative_index + tentative_len)
|| (c.index <= tentative_index && c.index + c.len > tentative_index)
})
{
(tentative_index, tentative_len, tentative_align_offset)
} else {
// Impossible to allocate at `tentative_index`. Let's try 0 instead.
if requested_len <= current_buffer.capacity
&& !chunks_in_use.iter().any(|c| c.index < requested_len)
{
(0, requested_len, 0)
} else {
// Buffer is full. Return.
return Err(data);
}
}
};
// Write `data` in the memory.
unsafe {
let range = (index * size_of::<T>() as DeviceSize + align_offset)
..((index + requested_len) * size_of::<T>() as DeviceSize + align_offset);
let allocation = match current_buffer.inner.memory() {
BufferMemory::Normal(a) => a,
BufferMemory::Sparse => unreachable!(),
};
let bytes = allocation.write(range.clone()).unwrap();
let mapping = <[T]>::from_bytes_mut(bytes).unwrap();
let mut written = 0;
for (o, i) in mapping.iter_mut().zip(data) {
ptr::write(o, i);
written += 1;
}
allocation.flush_range(range).unwrap();
assert_eq!(
written, requested_len,
"Iterator passed to CpuBufferPool::chunk has a mismatch between reported \
length and actual number of elements"
);
}
// Mark the chunk as in use.
current_buffer
.next_index
.store(index + occupied_len, Ordering::SeqCst);
chunks_in_use.push(ActualBufferChunk {
index,
len: occupied_len,
num_cpu_accesses: 1,
});
Ok(CpuBufferPoolChunk {
// TODO: remove .clone() once non-lexical borrows land
buffer: current_buffer.clone(),
index,
align_offset,
requested_len,
marker: PhantomData,
})
}
}
// Can't automatically derive `Clone`, otherwise the compiler adds a `T: Clone` requirement.
impl<T, A> Clone for CpuBufferPool<T, A>
where
[T]: BufferContents,
A: MemoryAllocator + ?Sized,
{
fn clone(&self) -> Self {
let buf = self.current_buffer.lock().unwrap();
CpuBufferPool {
allocator: self.allocator.clone(),
current_buffer: Mutex::new(buf.clone()),
buffer_usage: self.buffer_usage,
memory_usage: self.memory_usage,
marker: PhantomData,
}
}
}
unsafe impl<T, A> DeviceOwned for CpuBufferPool<T, A>
where
[T]: BufferContents,
A: MemoryAllocator + ?Sized,
{
fn device(&self) -> &Arc<Device> {
self.allocator.device()
}
}
impl<T> Clone for CpuBufferPoolChunk<T>
where
[T]: BufferContents,
{
fn clone(&self) -> CpuBufferPoolChunk<T> {
let mut chunks_in_use_lock = self.buffer.chunks_in_use.lock().unwrap();
let chunk = chunks_in_use_lock
.iter_mut()
.find(|c| c.index == self.index)
.unwrap();
debug_assert!(chunk.num_cpu_accesses >= 1);
chunk.num_cpu_accesses = chunk
.num_cpu_accesses
.checked_add(1)
.expect("Overflow in CPU accesses");
CpuBufferPoolChunk {
buffer: self.buffer.clone(),
index: self.index,
align_offset: self.align_offset,
requested_len: self.requested_len,
marker: PhantomData,
}
}
}
unsafe impl<T> BufferAccess for CpuBufferPoolChunk<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn inner(&self) -> BufferInner<'_> {
BufferInner {
buffer: &self.buffer.inner,
offset: self.index * size_of::<T>() as DeviceSize + self.align_offset,
}
}
fn size(&self) -> DeviceSize {
self.requested_len * size_of::<T>() as DeviceSize
}
}
impl<T> BufferAccessObject for Arc<CpuBufferPoolChunk<T>>
where
T: Send + Sync,
[T]: BufferContents,
{
fn as_buffer_access_object(&self) -> Arc<dyn BufferAccess> {
self.clone()
}
}
impl<T> Drop for CpuBufferPoolChunk<T>
where
[T]: BufferContents,
{
fn drop(&mut self) {
// If `requested_len` is 0, then no entry was added in the chunks.
if self.requested_len == 0 {
return;
}
let mut chunks_in_use_lock = self.buffer.chunks_in_use.lock().unwrap();
let chunk_num = chunks_in_use_lock
.iter_mut()
.position(|c| c.index == self.index)
.unwrap();
if chunks_in_use_lock[chunk_num].num_cpu_accesses >= 2 {
chunks_in_use_lock[chunk_num].num_cpu_accesses -= 1;
} else {
chunks_in_use_lock.remove(chunk_num);
}
}
}
unsafe impl<T> TypedBufferAccess for CpuBufferPoolChunk<T>
where
T: Send + Sync,
[T]: BufferContents,
{
type Content = [T];
}
unsafe impl<T> DeviceOwned for CpuBufferPoolChunk<T>
where
[T]: BufferContents,
{
fn device(&self) -> &Arc<Device> {
self.buffer.inner.device()
}
}
impl<T> PartialEq for CpuBufferPoolChunk<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn eq(&self, other: &Self) -> bool {
self.inner() == other.inner() && self.size() == other.size()
}
}
impl<T> Eq for CpuBufferPoolChunk<T>
where
T: Send + Sync,
[T]: BufferContents,
{
}
impl<T> Hash for CpuBufferPoolChunk<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.inner().hash(state);
self.size().hash(state);
}
}
impl<T> Clone for CpuBufferPoolSubbuffer<T>
where
[T]: BufferContents,
{
fn clone(&self) -> CpuBufferPoolSubbuffer<T> {
CpuBufferPoolSubbuffer {
chunk: self.chunk.clone(),
}
}
}
unsafe impl<T> BufferAccess for CpuBufferPoolSubbuffer<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn inner(&self) -> BufferInner<'_> {
self.chunk.inner()
}
fn size(&self) -> DeviceSize {
self.chunk.size()
}
}
impl<T> BufferAccessObject for Arc<CpuBufferPoolSubbuffer<T>>
where
T: Send + Sync,
[T]: BufferContents,
{
fn as_buffer_access_object(&self) -> Arc<dyn BufferAccess> {
self.clone()
}
}
unsafe impl<T> TypedBufferAccess for CpuBufferPoolSubbuffer<T>
where
T: BufferContents,
[T]: BufferContents,
{
type Content = T;
}
unsafe impl<T> DeviceOwned for CpuBufferPoolSubbuffer<T>
where
[T]: BufferContents,
{
fn device(&self) -> &Arc<Device> {
self.chunk.buffer.inner.device()
}
}
impl<T> PartialEq for CpuBufferPoolSubbuffer<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn eq(&self, other: &Self) -> bool {
self.inner() == other.inner() && self.size() == other.size()
}
}
impl<T> Eq for CpuBufferPoolSubbuffer<T>
where
T: Send + Sync,
[T]: BufferContents,
{
}
impl<T> Hash for CpuBufferPoolSubbuffer<T>
where
T: Send + Sync,
[T]: BufferContents,
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.inner().hash(state);
self.size().hash(state);
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn basic_create() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let _ = CpuBufferPool::<u8>::upload(memory_allocator);
}
#[test]
fn reserve() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let pool = CpuBufferPool::<u8>::upload(memory_allocator);
assert_eq!(pool.capacity(), 0);
pool.reserve(83).unwrap();
assert_eq!(pool.capacity(), 83);
}
#[test]
fn capacity_increase() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let pool = CpuBufferPool::upload(memory_allocator);
assert_eq!(pool.capacity(), 0);
pool.from_data(12).unwrap();
let first_cap = pool.capacity();
assert!(first_cap >= 1);
for _ in 0..first_cap + 5 {
mem::forget(pool.from_data(12).unwrap());
}
assert!(pool.capacity() > first_cap);
}
#[test]
fn reuse_subbuffers() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let pool = CpuBufferPool::upload(memory_allocator);
assert_eq!(pool.capacity(), 0);
let mut capacity = None;
for _ in 0..64 {
pool.from_data(12).unwrap();
let new_cap = pool.capacity();
assert!(new_cap >= 1);
match capacity {
None => capacity = Some(new_cap),
Some(c) => assert_eq!(c, new_cap),
}
}
}
#[test]
fn chunk_loopback() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let pool = CpuBufferPool::<u8>::upload(memory_allocator);
pool.reserve(5).unwrap();
let a = pool.from_iter(vec![0, 0]).unwrap();
let b = pool.from_iter(vec![0, 0]).unwrap();
assert_eq!(b.index, 2);
drop(a);
let c = pool.from_iter(vec![0, 0]).unwrap();
assert_eq!(c.index, 0);
assert_eq!(pool.capacity(), 5);
}
#[test]
fn chunk_0_elems_doesnt_pollute() {
let (device, _) = gfx_dev_and_queue!();
let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device));
let pool = CpuBufferPool::<u8>::upload(memory_allocator);
let _ = pool.from_iter(vec![]).unwrap();
let _ = pool.from_iter(vec![0, 0]).unwrap();
}
}


@@ -14,11 +14,12 @@
//! between a Vulkan buffer and a regular buffer is that the content of a Vulkan buffer is
//! accessible from the GPU.
//!
//! Vulkano does not perform any specific marshalling of buffer data. The representation of the buffer in
//! memory is identical between the CPU and GPU. Because the Rust compiler is allowed to reorder struct
//! fields at will by default when using `#[repr(Rust)]`, it is advised to mark each struct requiring
//! imput assembly as `#[repr(C)]`. This forces Rust to follow the standard C procedure. Each element is
//! laid out in memory in the order of declaration and aligned to a multiple of their alignment.
//! Vulkano does not perform any specific marshalling of buffer data. The representation of the
//! buffer in memory is identical between the CPU and GPU. Because the Rust compiler is allowed to
//! reorder struct fields at will by default when using `#[repr(Rust)]`, it is advised to mark each
//! struct requiring input assembly as `#[repr(C)]`. This forces Rust to follow the standard C
//! procedure. Each element is laid out in memory in the order of declaration and aligned to a
//! multiple of its alignment.
//!
//! # Various kinds of buffers
//!
@@ -28,26 +29,24 @@
//! Instead you are encouraged to use one of the high-level wrappers that vulkano provides. Which
//! wrapper to use depends on the way you are going to use the buffer:
//!
//! - A [`DeviceLocalBuffer`](crate::buffer::device_local::DeviceLocalBuffer) designates a buffer
//! usually located in video memory and whose content can't be directly accessed by your
//! application. Accessing this buffer from the GPU is generally faster compared to accessing a
//! CPU-accessible buffer.
//! - A [`CpuBufferPool`](crate::buffer::cpu_pool::CpuBufferPool) is a ring buffer that can be used to
//! transfer data between the CPU and the GPU at a high rate.
//! - A [`CpuAccessibleBuffer`](crate::buffer::cpu_access::CpuAccessibleBuffer) is a simple buffer that
//! can be used to prototype.
//! - A [`DeviceLocalBuffer`] designates a buffer usually located in video memory and whose content
//! can't be directly accessed by your application. Accessing this buffer from the GPU is
//! generally faster compared to accessing a CPU-accessible buffer.
//! - A [`CpuBufferAllocator`] can be used to transfer data between the CPU and the GPU at a high
//! rate.
//! - A [`CpuAccessibleBuffer`] is a simple buffer that can be used to prototype.
//!
//! Here is a quick way to choose which buffer to use. Do you often need to read or write
//! the content of the buffer? If so, use a `CpuBufferPool`. Otherwise, do you need to have access
//! Here is a quick way to choose which buffer to use. Do you often need to read or write the
//! content of the buffer? If so, use a `CpuBufferAllocator`. Otherwise, do you need to have access
//! to the buffer on the CPU? Then use `CpuAccessibleBuffer`. Otherwise, use a `DeviceLocalBuffer`.
//!
//! Another example: if a buffer is under constant access by the GPU but you need to
//! read its content on the CPU from time to time, it may be a good idea to use a
//! `DeviceLocalBuffer` as the main buffer and a `CpuBufferPool` for when you need to read it.
//! Then whenever you need to read the main buffer, ask the GPU to copy from the device-local
//! buffer to the CPU buffer pool, and read the CPU buffer pool instead.
//! Another example: if a buffer is under constant access by the GPU but you need to read its
//! content on the CPU from time to time, it may be a good idea to use a `DeviceLocalBuffer` as the
//! main buffer and a `CpuAccessibleBuffer` for when you need to read it. Then whenever you need to
//! read the main buffer, ask the GPU to copy from the device-local buffer to the CPU-accessible
//! buffer, and read the CPU-accessible buffer instead.
//!
//! # Buffers usage
//! # Buffer usage
//!
//! When you create a buffer object, you have to specify its *usage*. In other words, you have to
//! specify the way it is going to be used. Trying to use a buffer in a way that wasn't specified
@@ -64,18 +63,18 @@
//!
//! - As a uniform buffer. Uniform buffers are read-only.
//! - As a storage buffer. Storage buffers can be read and written.
//! - As a uniform texel buffer. Contrary to a uniform buffer, the data is interpreted by the
//! GPU and can be for example normalized.
//! - As a uniform texel buffer. Contrary to a uniform buffer, the data is interpreted by the GPU
//! and can be for example normalized.
//! - As a storage texel buffer. Additionally, some data formats can be modified with atomic
//! operations.
//!
//! Using uniform/storage texel buffers requires creating a *buffer view*. See the `view` module
//! for how to create a buffer view.
//!
//! [`CpuBufferAllocator`]: allocator::CpuBufferAllocator
pub use self::{
cpu_access::CpuAccessibleBuffer,
cpu_pool::CpuBufferPool,
device_local::DeviceLocalBuffer,
slice::BufferSlice,
sys::BufferError,
@@ -95,8 +94,8 @@ use bytemuck::{
};
use std::mem::size_of;
pub mod allocator;
pub mod cpu_access;
pub mod cpu_pool;
pub mod device_local;
pub mod sys;
pub mod view;
@@ -164,7 +163,7 @@ pub unsafe trait BufferContents: Send + Sync + 'static {
/// Converts an immutable reference to `Self` to an immutable byte slice.
fn as_bytes(&self) -> &[u8];
/// Converts a mutable reference to `Self` to an mutable byte slice.
/// Converts a mutable reference to `Self` to a mutable byte slice.
fn as_bytes_mut(&mut self) -> &mut [u8];
/// Converts an immutable byte slice into an immutable reference to `Self`.

View File

@@ -1624,6 +1624,16 @@ impl From<RequirementNotMet> for GenericMemoryAllocatorCreationError {
}
}
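// Rounds `val` up to the nearest multiple of `alignment`, which must be a power of two.
// For example, `align_up(5, 4) == 8`.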
pub(crate) fn align_up(val: DeviceSize, alignment: DeviceSize) -> DeviceSize {
align_down(val + alignment - 1, alignment)
}
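// Rounds `val` down to the nearest multiple of `alignment`, which must be a power of two.
// For example, `align_down(5, 4) == 4`.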
pub(crate) fn align_down(val: DeviceSize, alignment: DeviceSize) -> DeviceSize {
debug_assert!(alignment.is_power_of_two());
val & !(alignment - 1)
}
mod array_vec {
use std::ops::{Deref, DerefMut};


@@ -14,7 +14,9 @@
//! [the parent module]: super
use self::host::SlotId;
use super::{array_vec::ArrayVec, AllocationCreateInfo, AllocationCreationError};
use super::{
align_down, align_up, array_vec::ArrayVec, AllocationCreateInfo, AllocationCreationError,
};
use crate::{
device::{Device, DeviceOwned},
image::ImageTiling,
@@ -205,6 +207,10 @@ impl MemoryAlloc {
})
}
pub(crate) fn atom_size(&self) -> Option<NonZeroU64> {
self.atom_size
}
/// Invalidates the host (CPU) cache for a range of the allocation.
///
/// You must call this method before the memory is read by the host, if the device previously
@@ -239,8 +245,7 @@ impl MemoryAlloc {
.result()
.map_err(VulkanError::from)?;
} else {
// FIXME:
// self.debug_validate_memory_range(&range);
self.debug_validate_memory_range(&range);
}
Ok(())
@@ -280,8 +285,7 @@ impl MemoryAlloc {
.result()
.map_err(VulkanError::from)?;
} else {
// FIXME:
// self.debug_validate_memory_range(&range);
self.debug_validate_memory_range(&range);
}
Ok(())
@@ -330,18 +334,22 @@ impl MemoryAlloc {
/// This exists because even if no cache control is required, the parameters should still be
/// valid, otherwise you might have bugs in your code forever just because your memory happens
/// to be host-coherent.
#[allow(dead_code)]
fn debug_validate_memory_range(&self, range: &Range<DeviceSize>) {
debug_assert!(!range.is_empty() && range.end <= self.size);
debug_assert!({
let atom_size = self
.device()
.physical_device()
.properties()
.non_coherent_atom_size;
debug_assert!(
{
let atom_size = self
.device()
.physical_device()
.properties()
.non_coherent_atom_size;
range.start % atom_size == 0 && (range.end % atom_size == 0 || range.end == self.size)
});
range.start % atom_size == 0
&& (range.end % atom_size == 0 || range.end == self.size)
},
"attempted to invalidate or flush a memory range that is not aligned to the \
non-coherent atom size",
);
}
/// Returns the underlying block of [`DeviceMemory`].
@@ -925,17 +933,17 @@ impl Display for SuballocationCreationError {
/// });
/// ```
///
/// For use in allocating buffers for [`CpuBufferPool`]:
/// For use in allocating arenas for [`CpuBufferAllocator`]:
///
/// ```
/// use std::sync::Arc;
/// use vulkano::buffer::CpuBufferPool;
/// use vulkano::buffer::allocator::CpuBufferAllocator;
/// use vulkano::memory::allocator::StandardMemoryAllocator;
/// # let device: std::sync::Arc<vulkano::device::Device> = return;
///
/// // We need to wrap the allocator in an `Arc` so that we can share ownership of it.
/// let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));
/// let buffer_pool = CpuBufferPool::<u32>::upload(memory_allocator.clone());
/// let buffer_allocator = CpuBufferAllocator::new(memory_allocator.clone(), Default::default());
///
/// // You can continue using `memory_allocator` for other things.
/// ```
@@ -978,7 +986,7 @@ impl Display for SuballocationCreationError {
/// [alignment requirements]: super#alignment
/// [`GenericMemoryAllocator`]: super::GenericMemoryAllocator
/// [`StandardMemoryAllocator`]: super::StandardMemoryAllocator
/// [`CpuBufferPool`]: crate::buffer::CpuBufferPool
/// [`CpuBufferAllocator`]: crate::buffer::allocator::CpuBufferAllocator
#[derive(Debug)]
pub struct FreeListAllocator {
region: MemoryAlloc,
@@ -2430,16 +2438,6 @@ impl Display for BumpAllocatorResetError {
}
}
fn align_up(val: DeviceSize, alignment: DeviceSize) -> DeviceSize {
align_down(val + alignment - 1, alignment)
}
fn align_down(val: DeviceSize, alignment: DeviceSize) -> DeviceSize {
debug_assert!(alignment.is_power_of_two());
val & !(alignment - 1)
}
/// Checks if resources A and B share a page.
///
/// > **Note**: Assumes `a_offset + a_size > 0` and `a_offset + a_size <= b_offset`.