[hal/vk] Rework Submission and Surface Synchronization (#5681)

Fix two major synchronization issues in `wgpu_val::vulkan`: - Properly order queue command buffer submissions. Due to Mesa bugs, two semaphores are required even though the Vulkan spec says that only one should be necessary. - Properly manage surface texture acquisition and presentation: - Acquiring a surface texture can return while the presentation engine is still displaying the texture. Applications must wait for a semaphore to be signaled before using the acquired texture. - Presenting a surface texture requires a semaphore to ensure that drawing is complete before presentation occurs. Co-authored-by: Jim Blandy <jimb@red-bean.com>
2024-11-21 22:33:49 +00:00 · 2024-05-30 16:53:34 -04:00 · 2024-05-30 16:53:34 -04:00 · c7458638d1
commit c7458638d1
parent 9b7a965667
18 changed files with 698 additions and 303 deletions
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@ -1499,7 +1499,7 @@ impl Global {
                    .raw
                    .as_ref()
                    .unwrap()
-                    .submit(&refs, &submit_surface_textures, Some((fence, submit_index)))
+                    .submit(&refs, &submit_surface_textures, (fence, submit_index))
                    .map_err(DeviceError::from)?;
            }

--- a/wgpu-core/src/present.rs
+++ b/wgpu-core/src/present.rs
@ -154,17 +154,20 @@ impl Global {
                parent_id: surface_id,
            });
        }
-        #[cfg(not(feature = "trace"))]
-        let _ = device;
+
+        let fence_guard = device.fence.read();
+        let fence = fence_guard.as_ref().unwrap();

        let suf = A::surface_as_hal(surface.as_ref());
        let (texture_id, status) = match unsafe {
-            suf.unwrap()
-                .acquire_texture(Some(std::time::Duration::from_millis(
-                    FRAME_TIMEOUT_MS as u64,
-                )))
+            suf.unwrap().acquire_texture(
+                Some(std::time::Duration::from_millis(FRAME_TIMEOUT_MS as u64)),
+                fence,
+            )
        } {
            Ok(Some(ast)) => {
+                drop(fence_guard);
+
                let texture_desc = wgt::TextureDescriptor {
                    label: (),
                    size: wgt::Extent3d {
--- a/wgpu-hal/examples/halmark/main.rs
+++ b/wgpu-hal/examples/halmark/main.rs
@ -22,7 +22,6 @@ const MAX_BUNNIES: usize = 1 << 20;
 const BUNNY_SIZE: f32 = 0.15 * 256.0;
 const GRAVITY: f32 = -9.8 * 100.0;
 const MAX_VELOCITY: f32 = 750.0;
-const COMMAND_BUFFER_PER_CONTEXT: usize = 100;
 const DESIRED_MAX_LATENCY: u32 = 2;

 #[repr(C)]
@ -498,7 +497,7 @@ impl<A: hal::Api> Example<A> {
            let mut fence = device.create_fence().unwrap();
            let init_cmd = cmd_encoder.end_encoding().unwrap();
            queue
-                .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value)))
+                .submit(&[&init_cmd], &[], (&mut fence, init_fence_value))
                .unwrap();
            device.wait(&fence, init_fence_value, !0).unwrap();
            device.destroy_buffer(staging_buffer);
@ -550,7 +549,7 @@ impl<A: hal::Api> Example<A> {
            {
                let ctx = &mut self.contexts[self.context_index];
                self.queue
-                    .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                    .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                    .unwrap();
            }

@ -650,7 +649,13 @@ impl<A: hal::Api> Example<A> {

        let ctx = &mut self.contexts[self.context_index];

-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };

        let target_barrier0 = hal::TextureBarrier {
            texture: surface_tex.borrow(),
@ -718,7 +723,6 @@ impl<A: hal::Api> Example<A> {
        }

        ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;

        let target_barrier1 = hal::TextureBarrier {
            texture: surface_tex.borrow(),
@ -732,20 +736,18 @@ impl<A: hal::Api> Example<A> {

        unsafe {
            let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
            self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                .unwrap();
            self.queue.present(&self.surface, surface_tex).unwrap();
            ctx.used_cmd_bufs.push(cmd_buf);
            ctx.used_views.push(surface_tex_view);
        };

-        if do_fence {
        log::debug!("Context switch from {}", self.context_index);
        let old_fence_value = ctx.fence_value;
        if self.contexts.len() == 1 {
@ -771,7 +773,6 @@ impl<A: hal::Api> Example<A> {
        }
        next.fence_value = old_fence_value + 1;
    }
-    }
 }

 cfg_if::cfg_if! {
--- a/wgpu-hal/examples/raw-gles.rs
+++ b/wgpu-hal/examples/raw-gles.rs
@ -156,6 +156,7 @@ fn fill_screen(exposed: &hal::ExposedAdapter<hal::api::Gles>, width: u32, height
            })
            .unwrap()
    };
+    let mut fence = unsafe { od.device.create_fence().unwrap() };
    let rp_desc = hal::RenderPassDescriptor {
        label: None,
        extent: wgt::Extent3d {
@ -183,6 +184,6 @@ fn fill_screen(exposed: &hal::ExposedAdapter<hal::api::Gles>, width: u32, height
        encoder.begin_render_pass(&rp_desc);
        encoder.end_render_pass();
        let cmd_buf = encoder.end_encoding().unwrap();
-        od.queue.submit(&[&cmd_buf], &[], None).unwrap();
+        od.queue.submit(&[&cmd_buf], &[], (&mut fence, 0)).unwrap();
    }
 }
--- a/wgpu-hal/examples/ray-traced-triangle/main.rs
+++ b/wgpu-hal/examples/ray-traced-triangle/main.rs
@ -13,7 +13,6 @@ use std::{
 };
 use winit::window::WindowButtons;

-const COMMAND_BUFFER_PER_CONTEXT: usize = 100;
 const DESIRED_MAX_LATENCY: u32 = 2;

 /// [D3D12_RAYTRACING_INSTANCE_DESC](https://microsoft.github.io/DirectX-Specs/d3d/Raytracing.html#d3d12_raytracing_instance_desc)
@ -759,7 +758,7 @@ impl<A: hal::Api> Example<A> {
            let mut fence = device.create_fence().unwrap();
            let init_cmd = cmd_encoder.end_encoding().unwrap();
            queue
-                .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value)))
+                .submit(&[&init_cmd], &[], (&mut fence, init_fence_value))
                .unwrap();
            device.wait(&fence, init_fence_value, !0).unwrap();
            cmd_encoder.reset_all(iter::once(init_cmd));
@ -808,7 +807,13 @@ impl<A: hal::Api> Example<A> {
    fn render(&mut self) {
        let ctx = &mut self.contexts[self.context_index];

-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };

        let target_barrier0 = hal::TextureBarrier {
            texture: surface_tex.borrow(),
@ -909,7 +914,6 @@ impl<A: hal::Api> Example<A> {
        }

        ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;

        let target_barrier1 = hal::TextureBarrier {
            texture: surface_tex.borrow(),
@ -959,20 +963,18 @@ impl<A: hal::Api> Example<A> {

        unsafe {
            let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
            self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                .unwrap();
            self.queue.present(&self.surface, surface_tex).unwrap();
            ctx.used_cmd_bufs.push(cmd_buf);
            ctx.used_views.push(surface_tex_view);
        };

-        if do_fence {
        log::info!("Context switch from {}", self.context_index);
        let old_fence_value = ctx.fence_value;
        if self.contexts.len() == 1 {
@ -998,14 +1000,13 @@ impl<A: hal::Api> Example<A> {
        }
        next.fence_value = old_fence_value + 1;
    }
-    }

    fn exit(mut self) {
        unsafe {
            {
                let ctx = &mut self.contexts[self.context_index];
                self.queue
-                    .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                    .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                    .unwrap();
            }

--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@ -857,6 +857,7 @@ impl crate::Surface for Surface {
    unsafe fn acquire_texture(
        &self,
        timeout: Option<std::time::Duration>,
+        _fence: &Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
        let mut swapchain = self.swap_chain.write();
        let sc = swapchain.as_mut().unwrap();
@ -895,7 +896,7 @@ impl crate::Queue for Queue {
        &self,
        command_buffers: &[&CommandBuffer],
        _surface_textures: &[&Texture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
    ) -> Result<(), crate::DeviceError> {
        let mut temp_lists = self.temp_lists.lock();
        temp_lists.clear();
@ -908,11 +909,9 @@ impl crate::Queue for Queue {
            self.raw.execute_command_lists(&temp_lists);
        }

-        if let Some((fence, value)) = signal_fence {
        self.raw
-                .signal(&fence.raw, value)
+            .signal(&signal_fence.raw, signal_value)
            .into_device_result("Signal fence")?;
-        }

        // Note the lack of synchronization here between the main Direct queue
        // and the dedicated presentation queue. This is automatically handled
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@ -75,6 +75,7 @@ impl crate::Surface for Context {
    unsafe fn acquire_texture(
        &self,
        timeout: Option<std::time::Duration>,
+        fence: &Resource,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
        Ok(None)
    }
@ -114,7 +115,7 @@ impl crate::Queue for Context {
        &self,
        command_buffers: &[&Resource],
        surface_textures: &[&Resource],
-        signal_fence: Option<(&mut Resource, crate::FenceValue)>,
+        signal_fence: (&mut Resource, crate::FenceValue),
    ) -> DeviceResult<()> {
        Ok(())
    }
--- a/wgpu-hal/src/gles/egl.rs
+++ b/wgpu-hal/src/gles/egl.rs
@ -1432,6 +1432,7 @@ impl crate::Surface for Surface {
    unsafe fn acquire_texture(
        &self,
        _timeout_ms: Option<Duration>, //TODO
+        _fence: &super::Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
        let swapchain = self.swapchain.read();
        let sc = swapchain.as_ref().unwrap();
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@ -1740,7 +1740,7 @@ impl crate::Queue for super::Queue {
        &self,
        command_buffers: &[&super::CommandBuffer],
        _surface_textures: &[&super::Texture],
-        signal_fence: Option<(&mut super::Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut super::Fence, crate::FenceValue),
    ) -> Result<(), crate::DeviceError> {
        let shared = Arc::clone(&self.shared);
        let gl = &shared.context.lock();
@ -1774,12 +1774,10 @@ impl crate::Queue for super::Queue {
            }
        }

-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(gl);
+        signal_fence.maintain(gl);
        let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
            .map_err(|_| crate::DeviceError::OutOfMemory)?;
-            fence.pending.push((value, sync));
-        }
+        signal_fence.pending.push((signal_value, sync));

        Ok(())
    }
--- a/wgpu-hal/src/gles/web.rs
+++ b/wgpu-hal/src/gles/web.rs
@ -427,6 +427,7 @@ impl crate::Surface for Surface {
    unsafe fn acquire_texture(
        &self,
        _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
        let swapchain = self.swapchain.read();
        let sc = swapchain.as_ref().unwrap();
--- a/wgpu-hal/src/gles/wgl.rs
+++ b/wgpu-hal/src/gles/wgl.rs
@ -798,6 +798,7 @@ impl crate::Surface for Surface {
    unsafe fn acquire_texture(
        &self,
        _timeout_ms: Option<Duration>,
+        _fence: &super::Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
        let swapchain = self.swapchain.read();
        let sc = swapchain.as_ref().unwrap();
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@ -459,44 +459,101 @@ pub trait Instance: Sized + WasmNotSendSync {
 pub trait Surface: WasmNotSendSync {
    type A: Api;

-    /// Configures the surface to use the given device.
+    /// Configure `self` to use `device`.
    ///
    /// # Safety
    ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work using `self` must have been completed.
    /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
    /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - All surfaces created using other devices must have been unconfigured before this call.
+    /// - The surface `self` must not currently be configured to use any other [`Device`].
    unsafe fn configure(
        &self,
        device: &<Self::A as Api>::Device,
        config: &SurfaceConfiguration,
    ) -> Result<(), SurfaceError>;

-    /// Unconfigures the surface on the given device.
+    /// Unconfigure `self` on `device`.
    ///
    /// # Safety
    ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work that uses `surface` must have been completed.
    /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
    /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - The surface must have been configured on the given device.
+    /// - The surface `self` must have been configured on `device`.
    unsafe fn unconfigure(&self, device: &<Self::A as Api>::Device);

-    /// Returns the next texture to be presented by the swapchain for drawing
+    /// Return the next texture to be presented by `self`, for the caller to draw on.
    ///
-    /// A `timeout` of `None` means to wait indefinitely, with no timeout.
+    /// On success, return an [`AcquiredSurfaceTexture`] representing the
+    /// texture into which the caller should draw the image to be displayed on
+    /// `self`.
+    ///
+    /// If `timeout` elapses before `self` has a texture ready to be acquired,
+    /// return `Ok(None)`. If `timeout` is `None`, wait indefinitely, with no
+    /// timeout.
+    ///
+    /// # Using an [`AcquiredSurfaceTexture`]
+    ///
+    /// On success, this function returns an [`AcquiredSurfaceTexture`] whose
+    /// [`texture`] field is a [`SurfaceTexture`] from which the caller can
+    /// [`borrow`] a [`Texture`] to draw on. The [`AcquiredSurfaceTexture`] also
+    /// carries some metadata about that [`SurfaceTexture`].
+    ///
+    /// All calls to [`Queue::submit`] that draw on that [`Texture`] must also
+    /// include the [`SurfaceTexture`] in the `surface_textures` argument.
+    ///
+    /// When you are done drawing on the texture, you can display it on `self`
+    /// by passing the [`SurfaceTexture`] and `self` to [`Queue::present`].
+    ///
+    /// If you do not wish to display the texture, you must pass the
+    /// [`SurfaceTexture`] to [`self.discard_texture`], so that it can be reused
+    /// by future acquisitions.
    ///
    /// # Portability
    ///
-    /// Some backends can't support a timeout when acquiring a texture and
-    /// the timeout will be ignored.
+    /// Some backends can't support a timeout when acquiring a texture. On these
+    /// backends, `timeout` is ignored.
    ///
-    /// Returns `None` on timing out.
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - The `fence` argument must be the same [`Fence`] passed to all calls to
+    ///   [`Queue::submit`] that used [`Texture`]s acquired from this surface.
+    ///
+    /// - You may only have one texture acquired from `self` at a time. When
+    ///   `acquire_texture` returns `Ok(Some(ast))`, you must pass the returned
+    ///   [`SurfaceTexture`] `ast.texture` to either [`Queue::present`] or
+    ///   [`Surface::discard_texture`] before calling `acquire_texture` again.
+    ///
+    /// [`texture`]: AcquiredSurfaceTexture::texture
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`borrow`]: std::borrow::Borrow::borrow
+    /// [`Texture`]: Api::Texture
+    /// [`Fence`]: Api::Fence
+    /// [`self.discard_texture`]: Surface::discard_texture
    unsafe fn acquire_texture(
        &self,
        timeout: Option<std::time::Duration>,
+        fence: &<Self::A as Api>::Fence,
    ) -> Result<Option<AcquiredSurfaceTexture<Self::A>>, SurfaceError>;
+
+    /// Relinquish an acquired texture without presenting it.
+    ///
+    /// After this call, the texture underlying [`SurfaceTexture`] may be
+    /// returned by subsequent calls to [`self.acquire_texture`].
+    ///
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - `texture` must be a [`SurfaceTexture`] returned by a call to
+    ///   [`self.acquire_texture`] that has not yet been passed to
+    ///   [`Queue::present`].
+    ///
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`self.acquire_texture`]: Surface::acquire_texture
    unsafe fn discard_texture(&self, texture: <Self::A as Api>::SurfaceTexture);
 }

@ -762,17 +819,21 @@ pub trait Queue: WasmNotSendSync {

    /// Submit `command_buffers` for execution on GPU.
    ///
-    /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value`
-    /// when the operation is complete. See [`Fence`] for details.
+    /// Update `fence` to `value` when the operation is complete. See
+    /// [`Fence`] for details.
    ///
-    /// If two calls to `submit` on a single `Queue` occur in a particular order
-    /// (that is, they happen on the same thread, or on two threads that have
-    /// synchronized to establish an ordering), then the first submission's
-    /// commands all complete execution before any of the second submission's
-    /// commands begin. All results produced by one submission are visible to
-    /// the next.
+    /// A `wgpu_hal` queue is "single threaded": all command buffers are
+    /// executed in the order they're submitted, with each buffer able to see
+    /// previous buffers' results. Specifically:
    ///
-    /// Within a submission, command buffers execute in the order in which they
+    /// - If two calls to `submit` on a single `Queue` occur in a particular
+    ///   order (that is, they happen on the same thread, or on two threads that
+    ///   have synchronized to establish an ordering), then the first
+    ///   submission's commands all complete execution before any of the second
+    ///   submission's commands begin. All results produced by one submission
+    ///   are visible to the next.
+    ///
+    /// - Within a submission, command buffers execute in the order in which they
    ///   appear in `command_buffers`. All results produced by one buffer are
    ///   visible to the next.
    ///
@ -803,10 +864,16 @@ pub trait Queue: WasmNotSendSync {
    /// - Every [`SurfaceTexture`][st] that any command in `command_buffers`
    ///   writes to must appear in the `surface_textures` argument.
    ///
+    /// - No [`SurfaceTexture`][st] may appear in the `surface_textures`
+    ///   argument more than once.
+    ///
    /// - Each [`SurfaceTexture`][st] in `surface_textures` must be configured
    ///   for use with the [`Device`][d] associated with this [`Queue`],
    ///   typically by calling [`Surface::configure`].
    ///
+    /// - All calls to this function that include a given [`SurfaceTexture`][st]
+    ///   in `surface_textures` must use the same [`Fence`].
+    ///
    /// [`Fence`]: Api::Fence
    /// [cb]: Api::CommandBuffer
    /// [ce]: Api::CommandEncoder
@ -819,7 +886,7 @@ pub trait Queue: WasmNotSendSync {
        &self,
        command_buffers: &[&<Self::A as Api>::CommandBuffer],
        surface_textures: &[&<Self::A as Api>::SurfaceTexture],
-        signal_fence: Option<(&mut <Self::A as Api>::Fence, FenceValue)>,
+        signal_fence: (&mut <Self::A as Api>::Fence, FenceValue),
    ) -> Result<(), DeviceError>;
    unsafe fn present(
        &self,
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@ -377,14 +377,13 @@ impl crate::Queue for Queue {
        &self,
        command_buffers: &[&CommandBuffer],
        _surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
    ) -> Result<(), crate::DeviceError> {
        objc::rc::autoreleasepool(|| {
-            let extra_command_buffer = match signal_fence {
-                Some((fence, value)) => {
-                    let completed_value = Arc::clone(&fence.completed_value);
+            let extra_command_buffer = {
+                let completed_value = Arc::clone(&signal_fence.completed_value);
                let block = block::ConcreteBlock::new(move |_cmd_buf| {
-                        completed_value.store(value, atomic::Ordering::Release);
+                    completed_value.store(signal_value, atomic::Ordering::Release);
                })
                .copy();

@ -400,15 +399,15 @@ impl crate::Queue for Queue {
                raw.set_label("(wgpu internal) Signal");
                raw.add_completed_handler(&block);

-                    fence.maintain();
-                    fence.pending_command_buffers.push((value, raw.to_owned()));
+                signal_fence.maintain();
+                signal_fence
+                    .pending_command_buffers
+                    .push((signal_value, raw.to_owned()));
                // only return an extra one if it's extra
                match command_buffers.last() {
                    Some(_) => None,
                    None => Some(raw),
                }
-                }
-                None => None,
            };

            for cmd_buffer in command_buffers {
--- a/wgpu-hal/src/metal/surface.rs
+++ b/wgpu-hal/src/metal/surface.rs
@ -242,6 +242,7 @@ impl crate::Surface for super::Surface {
    unsafe fn acquire_texture(
        &self,
        _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
        let render_layer = self.render_layer.lock();
        let (drawable, texture) = match autoreleasepool(|| {
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@ -3,11 +3,7 @@ use super::conv;
 use ash::{amd, ext, khr, vk};
 use parking_lot::Mutex;

-use std::{
-    collections::BTreeMap,
-    ffi::CStr,
-    sync::{atomic::AtomicIsize, Arc},
-};
+use std::{collections::BTreeMap, ffi::CStr, sync::Arc};

 fn depth_stencil_required_flags() -> vk::FormatFeatureFlags {
    vk::FormatFeatureFlags::SAMPLED_IMAGE | vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT
@ -1783,21 +1779,15 @@ impl super::Adapter {
            render_passes: Mutex::new(Default::default()),
            framebuffers: Mutex::new(Default::default()),
        });
-        let mut relay_semaphores = [vk::Semaphore::null(); 2];
-        for sem in relay_semaphores.iter_mut() {
-            unsafe {
-                *sem = shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)?
-            };
-        }
+
+        let relay_semaphores = super::RelaySemaphores::new(&shared)?;
+
        let queue = super::Queue {
            raw: raw_queue,
            swapchain_fn,
            device: Arc::clone(&shared),
            family_index,
-            relay_semaphores,
-            relay_index: AtomicIsize::new(-1),
+            relay_semaphores: Mutex::new(relay_semaphores),
        };

        let mem_allocator = {
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@ -612,17 +612,16 @@ impl super::Device {
        let images =
            unsafe { functor.get_swapchain_images(raw) }.map_err(crate::DeviceError::from)?;

-        // NOTE: It's important that we define at least images.len() + 1 wait
+        // NOTE: It's important that we define at least images.len() wait
        // semaphores, since we prospectively need to provide the call to
        // acquire the next image with an unsignaled semaphore.
-        let surface_semaphores = (0..images.len() + 1)
-            .map(|_| unsafe {
-                self.shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+        let surface_semaphores = (0..=images.len())
+            .map(|_| {
+                super::SwapchainImageSemaphores::new(&self.shared)
+                    .map(Mutex::new)
+                    .map(Arc::new)
            })
-            .collect::<Result<Vec<_>, _>>()
-            .map_err(crate::DeviceError::from)?;
+            .collect::<Result<Vec<_>, _>>()?;

        Ok(super::Swapchain {
            raw,
@ -633,7 +632,7 @@ impl super::Device {
            config: config.clone(),
            view_formats: wgt_view_formats,
            surface_semaphores,
-            next_surface_index: 0,
+            next_semaphore_index: 0,
        })
    }

@ -836,9 +835,12 @@ impl crate::Device for super::Device {
    unsafe fn exit(self, queue: super::Queue) {
        unsafe { self.mem_allocator.into_inner().cleanup(&*self.shared) };
        unsafe { self.desc_allocator.into_inner().cleanup(&*self.shared) };
-        for &sem in queue.relay_semaphores.iter() {
-            unsafe { self.shared.raw.destroy_semaphore(sem, None) };
-        }
+        unsafe {
+            queue
+                .relay_semaphores
+                .into_inner()
+                .destroy(&self.shared.raw)
+        };
        unsafe { self.shared.free_resources() };
    }

@ -2055,54 +2057,7 @@ impl crate::Device for super::Device {
        timeout_ms: u32,
    ) -> Result<bool, crate::DeviceError> {
        let timeout_ns = timeout_ms as u64 * super::MILLIS_TO_NANOS;
-        match *fence {
-            super::Fence::TimelineSemaphore(raw) => {
-                let semaphores = [raw];
-                let values = [wait_value];
-                let vk_info = vk::SemaphoreWaitInfo::default()
-                    .semaphores(&semaphores)
-                    .values(&values);
-                let result = match self.shared.extension_fns.timeline_semaphore {
-                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
-                        ext.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    Some(super::ExtensionFn::Promoted) => unsafe {
-                        self.shared.raw.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    None => unreachable!(),
-                };
-                match result {
-                    Ok(()) => Ok(true),
-                    Err(vk::Result::TIMEOUT) => Ok(false),
-                    Err(other) => Err(other.into()),
-                }
-            }
-            super::Fence::FencePool {
-                last_completed,
-                ref active,
-                free: _,
-            } => {
-                if wait_value <= last_completed {
-                    Ok(true)
-                } else {
-                    match active.iter().find(|&&(value, _)| value >= wait_value) {
-                        Some(&(_, raw)) => {
-                            match unsafe {
-                                self.shared.raw.wait_for_fences(&[raw], true, timeout_ns)
-                            } {
-                                Ok(()) => Ok(true),
-                                Err(vk::Result::TIMEOUT) => Ok(false),
-                                Err(other) => Err(other.into()),
-                            }
-                        }
-                        None => {
-                            log::error!("No signals reached value {}", wait_value);
-                            Err(crate::DeviceError::Lost)
-                        }
-                    }
-                }
-            }
-        }
+        self.shared.wait_for_fence(fence, wait_value, timeout_ns)
    }

    unsafe fn start_capture(&self) -> bool {
@ -2364,6 +2319,71 @@ impl crate::Device for super::Device {
    }
 }

+impl super::DeviceShared {
+    pub(super) fn new_binary_semaphore(&self) -> Result<vk::Semaphore, crate::DeviceError> {
+        unsafe {
+            self.raw
+                .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+                .map_err(crate::DeviceError::from)
+        }
+    }
+
+    pub(super) fn wait_for_fence(
+        &self,
+        fence: &super::Fence,
+        wait_value: crate::FenceValue,
+        timeout_ns: u64,
+    ) -> Result<bool, crate::DeviceError> {
+        profiling::scope!("Device::wait");
+        match *fence {
+            super::Fence::TimelineSemaphore(raw) => {
+                let semaphores = [raw];
+                let values = [wait_value];
+                let vk_info = vk::SemaphoreWaitInfo::default()
+                    .semaphores(&semaphores)
+                    .values(&values);
+                let result = match self.extension_fns.timeline_semaphore {
+                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
+                        ext.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    Some(super::ExtensionFn::Promoted) => unsafe {
+                        self.raw.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    None => unreachable!(),
+                };
+                match result {
+                    Ok(()) => Ok(true),
+                    Err(vk::Result::TIMEOUT) => Ok(false),
+                    Err(other) => Err(other.into()),
+                }
+            }
+            super::Fence::FencePool {
+                last_completed,
+                ref active,
+                free: _,
+            } => {
+                if wait_value <= last_completed {
+                    Ok(true)
+                } else {
+                    match active.iter().find(|&&(value, _)| value >= wait_value) {
+                        Some(&(_, raw)) => {
+                            match unsafe { self.raw.wait_for_fences(&[raw], true, timeout_ns) } {
+                                Ok(()) => Ok(true),
+                                Err(vk::Result::TIMEOUT) => Ok(false),
+                                Err(other) => Err(other.into()),
+                            }
+                        }
+                        None => {
+                            log::error!("No signals reached value {}", wait_value);
+                            Err(crate::DeviceError::Lost)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 impl From<gpu_alloc::AllocationError> for crate::DeviceError {
    fn from(error: gpu_alloc::AllocationError) -> Self {
        use gpu_alloc::AllocationError as Ae;
--- a/wgpu-hal/src/vulkan/instance.rs
+++ b/wgpu-hal/src/vulkan/instance.rs
@ -164,10 +164,14 @@ impl super::Swapchain {
            let _ = unsafe { device.device_wait_idle() };
        };

+        // We cannot take this by value, as the function returns `self`.
        for semaphore in self.surface_semaphores.drain(..) {
-            unsafe {
-                device.destroy_semaphore(semaphore, None);
-            }
+            let arc_removed = Arc::into_inner(semaphore).expect(
+                "Trying to destroy a SurfaceSemaphores that is still in use by a SurfaceTexture",
+            );
+            let mutex_removed = arc_removed.into_inner();
+
+            unsafe { mutex_removed.destroy(device) };
        }

        self
@ -966,9 +970,10 @@ impl crate::Surface for super::Surface {
    unsafe fn acquire_texture(
        &self,
        timeout: Option<std::time::Duration>,
+        fence: &super::Fence,
    ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
        let mut swapchain = self.swapchain.write();
-        let sc = swapchain.as_mut().unwrap();
+        let swapchain = swapchain.as_mut().unwrap();

        let mut timeout_ns = match timeout {
            Some(duration) => duration.as_nanos() as u64,
@ -988,12 +993,40 @@ impl crate::Surface for super::Surface {
            timeout_ns = u64::MAX;
        }

-        let wait_semaphore = sc.surface_semaphores[sc.next_surface_index];
+        let swapchain_semaphores_arc = swapchain.get_surface_semaphores();
+        // Nothing should be using this, so we don't block, but panic if we fail to lock.
+        let locked_swapchain_semaphores = swapchain_semaphores_arc
+            .try_lock()
+            .expect("Failed to lock a SwapchainSemaphores.");
+
+        // Wait for all commands writing to the previously acquired image to
+        // complete.
+        //
+        // Almost all the steps in the usual acquire-draw-present flow are
+        // asynchronous: they get something started on the presentation engine
+        // or the GPU, but on the CPU, control returns immediately. Without some
+        // sort of intervention, the CPU could crank out frames much faster than
+        // the presentation engine can display them.
+        //
+        // This is the intervention: if any submissions drew on this image, and
+        // thus waited for `locked_swapchain_semaphores.acquire`, wait for all
+        // of them to finish, thus ensuring that it's okay to pass `acquire` to
+        // `vkAcquireNextImageKHR` again.
+        swapchain.device.wait_for_fence(
+            fence,
+            locked_swapchain_semaphores.previously_used_submission_index,
+            timeout_ns,
+        )?;

        // will block if no image is available
        let (index, suboptimal) = match unsafe {
-            sc.functor
-                .acquire_next_image(sc.raw, timeout_ns, wait_semaphore, vk::Fence::null())
+            profiling::scope!("vkAcquireNextImageKHR");
+            swapchain.functor.acquire_next_image(
+                swapchain.raw,
+                timeout_ns,
+                locked_swapchain_semaphores.acquire,
+                vk::Fence::null(),
+            )
        } {
            // We treat `VK_SUBOPTIMAL_KHR` as `VK_SUCCESS` on Android.
            // See the comment in `Queue::present`.
@ -1013,16 +1046,18 @@ impl crate::Surface for super::Surface {
            }
        };

-        sc.next_surface_index += 1;
-        sc.next_surface_index %= sc.surface_semaphores.len();
+        drop(locked_swapchain_semaphores);
+        // We only advance the surface semaphores if we successfully acquired an image, otherwise
+        // we should try to re-acquire using the same semaphores.
+        swapchain.advance_surface_semaphores();

        // special case for Intel Vulkan returning bizarre values (ugh)
-        if sc.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
+        if swapchain.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
            return Err(crate::SurfaceError::Outdated);
        }

        // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-framebuffer-03209
-        let raw_flags = if sc
+        let raw_flags = if swapchain
            .raw_flags
            .contains(vk::SwapchainCreateFlagsKHR::MUTABLE_FORMAT)
        {
@ -1034,20 +1069,20 @@ impl crate::Surface for super::Surface {
        let texture = super::SurfaceTexture {
            index,
            texture: super::Texture {
-                raw: sc.images[index as usize],
+                raw: swapchain.images[index as usize],
                drop_guard: None,
                block: None,
-                usage: sc.config.usage,
-                format: sc.config.format,
+                usage: swapchain.config.usage,
+                format: swapchain.config.format,
                raw_flags,
                copy_size: crate::CopyExtent {
-                    width: sc.config.extent.width,
-                    height: sc.config.extent.height,
+                    width: swapchain.config.extent.width,
+                    height: swapchain.config.extent.height,
                    depth: 1,
                },
-                view_formats: sc.view_formats.clone(),
+                view_formats: swapchain.view_formats.clone(),
            },
-            wait_semaphore,
+            surface_semaphores: swapchain_semaphores_arc,
        };
        Ok(Some(crate::AcquiredSurfaceTexture {
            texture,
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@ -33,13 +33,11 @@ mod instance;

 use std::{
    borrow::Borrow,
+    collections::HashSet,
    ffi::{CStr, CString},
-    fmt,
+    fmt, mem,
    num::NonZeroU32,
-    sync::{
-        atomic::{AtomicIsize, Ordering},
-        Arc,
-    },
+    sync::Arc,
 };

 use arrayvec::ArrayVec;
@ -147,6 +145,173 @@ pub struct Instance {
    shared: Arc<InstanceShared>,
 }

+/// The semaphores needed to use one image in a swapchain.
+#[derive(Debug)]
+struct SwapchainImageSemaphores {
+    /// A semaphore that is signaled when this image is safe for us to modify.
+    ///
+    /// When [`vkAcquireNextImageKHR`] returns the index of the next swapchain
+    /// image that we should use, that image may actually still be in use by the
+    /// presentation engine, and is not yet safe to modify. However, that
+    /// function does accept a semaphore that it will signal when the image is
+    /// indeed safe to begin messing with.
+    ///
+    /// This semaphore is:
+    ///
+    /// - waited for by the first queue submission to operate on this image
+    ///   since it was acquired, and
+    ///
+    /// - signaled by [`vkAcquireNextImageKHR`] when the acquired image is ready
+    ///   for us to use.
+    ///
+    /// [`vkAcquireNextImageKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkAcquireNextImageKHR
+    acquire: vk::Semaphore,
+
+    /// True if the next command submission operating on this image should wait
+    /// for [`acquire`].
+    ///
+    /// We must wait for `acquire` before drawing to this swapchain image, but
+    /// because `wgpu-hal` queue submissions are always strongly ordered, only
+    /// the first submission that works with a swapchain image actually needs to
+    /// wait. We set this flag when this image is acquired, and clear it the
+    /// first time it's passed to [`Queue::submit`] as a surface texture.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    /// [`Queue::submit`]: crate::Queue::submit
+    should_wait_for_acquire: bool,
+
+    /// A pool of semaphores for ordering presentation after drawing.
+    ///
+    /// The first [`present_index`] semaphores in this vector are:
+    ///
+    /// - all waited on by the call to [`vkQueuePresentKHR`] that presents this
+    ///   image, and
+    ///
+    /// - each signaled by some [`vkQueueSubmit`] queue submission that draws to
+    ///   this image, when the submission finishes execution.
+    ///
+    /// This vector accumulates one semaphore per submission that writes to this
+    /// image. This is awkward, but hard to avoid: [`vkQueuePresentKHR`]
+    /// requires a semaphore to order it with respect to drawing commands, and
+    /// we can't attach new completion semaphores to a command submission after
+    /// it's been submitted. This means that, at submission time, we must create
+    /// the semaphore we might need if the caller's next action is to enqueue a
+    /// presentation of this image.
+    ///
+    /// An alternative strategy would be for presentation to enqueue an empty
+    /// submit, ordered relative to other submits in the usual way, and
+    /// signaling a single presentation semaphore. But we suspect that submits
+    /// are usually expensive enough, and semaphores usually cheap enough, that
+    /// performance-sensitive users will avoid making many submits, so that the
+    /// cost of accumulated semaphores will usually be less than the cost of an
+    /// additional submit.
+    ///
+    /// Only the first [`present_index`] semaphores in the vector are actually
+    /// going to be signalled by submitted commands, and need to be waited for
+    /// by the next present call. Any semaphores beyond that index were created
+    /// for prior presents and are simply being retained for recycling.
+    ///
+    /// [`present_index`]: SwapchainImageSemaphores::present_index
+    /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    present: Vec<vk::Semaphore>,
+
+    /// The number of semaphores in [`present`] to be signalled for this submission.
+    ///
+    /// [`present`]: SwapchainImageSemaphores::present
+    present_index: usize,
+
+    /// The fence value of the last command submission that wrote to this image.
+    ///
+    /// The next time we try to acquire this image, we'll block until
+    /// this submission finishes, proving that [`acquire`] is ready to
+    /// pass to `vkAcquireNextImageKHR` again.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    previously_used_submission_index: crate::FenceValue,
+}
+
+impl SwapchainImageSemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            acquire: device.new_binary_semaphore()?,
+            should_wait_for_acquire: true,
+            present: Vec::new(),
+            present_index: 0,
+            previously_used_submission_index: 0,
+        })
+    }
+
+    fn set_used_fence_value(&mut self, value: crate::FenceValue) {
+        self.previously_used_submission_index = value;
+    }
+
+    /// Return the semaphore that commands drawing to this image should wait for, if any.
+    ///
+    /// This only returns `Some` once per acquisition; see
+    /// [`SwapchainImageSemaphores::should_wait_for_acquire`] for details.
+    fn get_acquire_wait_semaphore(&mut self) -> Option<vk::Semaphore> {
+        if self.should_wait_for_acquire {
+            self.should_wait_for_acquire = false;
+            Some(self.acquire)
+        } else {
+            None
+        }
+    }
+
+    /// Return a semaphore that a submission that writes to this image should
+    /// signal when it's done.
+    ///
+    /// See [`SwapchainImageSemaphores::present`] for details.
+    fn get_submit_signal_semaphore(
+        &mut self,
+        device: &DeviceShared,
+    ) -> Result<vk::Semaphore, crate::DeviceError> {
+        // Try to recycle a semaphore we created for a previous presentation.
+        let sem = match self.present.get(self.present_index) {
+            Some(sem) => *sem,
+            None => {
+                let sem = device.new_binary_semaphore()?;
+                self.present.push(sem);
+                sem
+            }
+        };
+
+        self.present_index += 1;
+
+        Ok(sem)
+    }
+
+    /// Return the semaphores that a presentation of this image should wait on.
+    ///
+    /// Return a slice of semaphores that the call to [`vkQueueSubmit`] that
+    /// ends this image's acquisition should wait for. See
+    /// [`SwapchainImageSemaphores::present`] for details.
+    ///
+    /// Reset `self` to be ready for the next acquisition cycle.
+    ///
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    fn get_present_wait_semaphores(&mut self) -> &[vk::Semaphore] {
+        let old_index = self.present_index;
+
+        // Since this marks the end of this acquire/draw/present cycle, take the
+        // opportunity to reset `self` in preparation for the next acquisition.
+        self.present_index = 0;
+        self.should_wait_for_acquire = true;
+
+        &self.present[0..old_index]
+    }
+
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            device.destroy_semaphore(self.acquire, None);
+            for sem in &self.present {
+                device.destroy_semaphore(*sem, None);
+            }
+        }
+    }
+}
+
 struct Swapchain {
    raw: vk::SwapchainKHR,
    raw_flags: vk::SwapchainCreateFlagsKHR,
@ -157,9 +322,25 @@ struct Swapchain {
    view_formats: Vec<wgt::TextureFormat>,
    /// One wait semaphore per swapchain image. This will be associated with the
    /// surface texture, and later collected during submission.
-    surface_semaphores: Vec<vk::Semaphore>,
-    /// Current semaphore index to use when acquiring a surface.
-    next_surface_index: usize,
+    ///
+    /// We need this to be `Arc<Mutex<>>` because we need to be able to pass this
+    /// data into the surface texture, so submit/present can use it.
+    surface_semaphores: Vec<Arc<Mutex<SwapchainImageSemaphores>>>,
+    /// The index of the next semaphore to use. Ideally we would use the same
+    /// index as the image index, but we need to specify the semaphore as an argument
+    /// to the acquire_next_image function which is what tells us which image to use.
+    next_semaphore_index: usize,
+}
+
+impl Swapchain {
+    fn advance_surface_semaphores(&mut self) {
+        let semaphore_count = self.surface_semaphores.len();
+        self.next_semaphore_index = (self.next_semaphore_index + 1) % semaphore_count;
+    }
+
+    fn get_surface_semaphores(&self) -> Arc<Mutex<SwapchainImageSemaphores>> {
+        self.surface_semaphores[self.next_semaphore_index].clone()
+    }
 }

 pub struct Surface {
@ -173,7 +354,7 @@ pub struct Surface {
 pub struct SurfaceTexture {
    index: u32,
    texture: Texture,
-    wait_semaphore: vk::Semaphore,
+    surface_semaphores: Arc<Mutex<SwapchainImageSemaphores>>,
 }

 impl Borrow<Texture> for SurfaceTexture {
@ -359,18 +540,87 @@ pub struct Device {
    render_doc: crate::auxil::renderdoc::RenderDoc,
 }

+/// Semaphores for forcing queue submissions to run in order.
+///
+/// The [`wgpu_hal::Queue`] trait promises that if two calls to [`submit`] are
+/// ordered, then the first submission will finish on the GPU before the second
+/// submission begins. To get this behavior on Vulkan we need to pass semaphores
+/// to [`vkQueueSubmit`] for the commands to wait on before beginning execution,
+/// and to signal when their execution is done.
+///
+/// Normally this can be done with a single semaphore, waited on and then
+/// signalled for each submission. At any given time there's exactly one
+/// submission that would signal the semaphore, and exactly one waiting on it,
+/// as Vulkan requires.
+///
+/// However, as of Oct 2021, bug [#5508] in the Mesa ANV drivers caused them to
+/// hang if we use a single semaphore. The workaround is to alternate between
+/// two semaphores. The bug has been fixed in Mesa, but we should probably keep
+/// the workaround until, say, Oct 2026.
+///
+/// [`wgpu_hal::Queue`]: crate::Queue
+/// [`submit`]: crate::Queue::submit
+/// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+/// [#5508]: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508
+#[derive(Clone)]
+struct RelaySemaphores {
+    /// The semaphore the next submission should wait on before beginning
+    /// execution on the GPU. This is `None` for the first submission, which
+    /// should not wait on anything at all.
+    wait: Option<vk::Semaphore>,
+
+    /// The semaphore the next submission should signal when it has finished
+    /// execution on the GPU.
+    signal: vk::Semaphore,
+}
+
+impl RelaySemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            wait: None,
+            signal: device.new_binary_semaphore()?,
+        })
+    }
+
+    /// Advances the semaphores, returning the semaphores that should be used for a submission.
+    fn advance(&mut self, device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        let old = self.clone();
+
+        // Build the state for the next submission.
+        match self.wait {
+            None => {
+                // The `old` values describe the first submission to this queue.
+                // The second submission should wait on `old.signal`, and then
+                // signal a new semaphore which we'll create now.
+                self.wait = Some(old.signal);
+                self.signal = device.new_binary_semaphore()?;
+            }
+            Some(ref mut wait) => {
+                // What this submission signals, the next should wait.
+                mem::swap(wait, &mut self.signal);
+            }
+        };
+
+        Ok(old)
+    }
+
+    /// Destroys the semaphores.
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            if let Some(wait) = self.wait {
+                device.destroy_semaphore(wait, None);
+            }
+            device.destroy_semaphore(self.signal, None);
+        }
+    }
+}
+
 pub struct Queue {
    raw: vk::Queue,
    swapchain_fn: khr::swapchain::Device,
    device: Arc<DeviceShared>,
    family_index: u32,
-    /// We use a redundant chain of semaphores to pass on the signal
-    /// from submissions to the last present, since it's required by the
-    /// specification.
-    /// It would be correct to use a single semaphore there, but
-    /// [Intel hangs in `anv_queue_finish`](https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508).
-    relay_semaphores: [vk::Semaphore; 2],
-    relay_index: AtomicIsize,
+    relay_semaphores: Mutex<RelaySemaphores>,
 }

 #[derive(Debug)]
@ -702,42 +952,74 @@ impl crate::Queue for Queue {
        &self,
        command_buffers: &[&CommandBuffer],
        surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
    ) -> Result<(), crate::DeviceError> {
        let mut fence_raw = vk::Fence::null();

        let mut wait_stage_masks = Vec::new();
        let mut wait_semaphores = Vec::new();
-        let mut signal_semaphores = ArrayVec::<_, 2>::new();
-        let mut signal_values = ArrayVec::<_, 2>::new();
+        let mut signal_semaphores = Vec::new();
+        let mut signal_values = Vec::new();

-        for &surface_texture in surface_textures {
+        // Double check that the same swapchain image isn't being given to us multiple times,
+        // as that will deadlock when we try to lock them all.
+        debug_assert!(
+            {
+                let mut check = HashSet::with_capacity(surface_textures.len());
+                // We compare the Arcs by pointer, as Eq isn't well defined for SurfaceSemaphores.
+                for st in surface_textures {
+                    check.insert(Arc::as_ptr(&st.surface_semaphores));
+                }
+                check.len() == surface_textures.len()
+            },
+            "More than one surface texture is being used from the same swapchain. This will cause a deadlock in release."
+        );
+
+        let locked_swapchain_semaphores = surface_textures
+            .iter()
+            .map(|st| {
+                st.surface_semaphores
+                    .try_lock()
+                    .expect("Failed to lock surface semaphore.")
+            })
+            .collect::<Vec<_>>();
+
+        for mut swapchain_semaphore in locked_swapchain_semaphores {
+            swapchain_semaphore.set_used_fence_value(signal_value);
+
+            // If we're the first submission to operate on this image, wait on
+            // its acquire semaphore, to make sure the presentation engine is
+            // done with it.
+            if let Some(sem) = swapchain_semaphore.get_acquire_wait_semaphore() {
                wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE);
-            wait_semaphores.push(surface_texture.wait_semaphore);
+                wait_semaphores.push(sem);
            }

-        let old_index = self.relay_index.load(Ordering::Relaxed);
+            // Get a semaphore to signal when we're done writing to this surface
+            // image. Presentation of this image will wait for this.
+            let signal_semaphore = swapchain_semaphore.get_submit_signal_semaphore(&self.device)?;
+            signal_semaphores.push(signal_semaphore);
+            signal_values.push(!0);
+        }

-        let sem_index = if old_index >= 0 {
+        // In order for submissions to be strictly ordered, we encode a dependency between each submission
+        // using a pair of semaphores. This adds a wait if it is needed, and signals the next semaphore.
+        let semaphore_state = self.relay_semaphores.lock().advance(&self.device)?;
+
+        if let Some(sem) = semaphore_state.wait {
            wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE);
-            wait_semaphores.push(self.relay_semaphores[old_index as usize]);
-            (old_index as usize + 1) % self.relay_semaphores.len()
-        } else {
-            0
-        };
+            wait_semaphores.push(sem);
+        }

-        signal_semaphores.push(self.relay_semaphores[sem_index]);
+        signal_semaphores.push(semaphore_state.signal);
+        signal_values.push(!0);

-        self.relay_index
-            .store(sem_index as isize, Ordering::Relaxed);
-
-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(&self.device.raw)?;
-            match *fence {
+        // We need to signal our wgpu::Fence if we have one, this adds it to the signal list.
+        signal_fence.maintain(&self.device.raw)?;
+        match *signal_fence {
            Fence::TimelineSemaphore(raw) => {
                signal_semaphores.push(raw);
-                    signal_values.push(!0);
-                    signal_values.push(value);
+                signal_values.push(signal_value);
            }
            Fence::FencePool {
                ref mut active,
@ -752,8 +1034,7 @@ impl crate::Queue for Queue {
                            .create_fence(&vk::FenceCreateInfo::default(), None)?
                    },
                };
-                    active.push((value, fence_raw));
-                }
+                active.push((signal_value, fence_raw));
            }
        }

@ -771,7 +1052,7 @@ impl crate::Queue for Queue {

        let mut vk_timeline_info;

-        if !signal_values.is_empty() {
+        if self.device.private_caps.timeline_semaphores {
            vk_timeline_info =
                vk::TimelineSemaphoreSubmitInfo::default().signal_semaphore_values(&signal_values);
            vk_info = vk_info.push_next(&mut vk_timeline_info);
@ -793,19 +1074,14 @@ impl crate::Queue for Queue {
    ) -> Result<(), crate::SurfaceError> {
        let mut swapchain = surface.swapchain.write();
        let ssc = swapchain.as_mut().unwrap();
+        let mut swapchain_semaphores = texture.surface_semaphores.lock();

        let swapchains = [ssc.raw];
        let image_indices = [texture.index];
-        let mut vk_info = vk::PresentInfoKHR::default()
+        let vk_info = vk::PresentInfoKHR::default()
            .swapchains(&swapchains)
-            .image_indices(&image_indices);
-
-        let old_index = self.relay_index.swap(-1, Ordering::Relaxed);
-        if old_index >= 0 {
-            vk_info = vk_info.wait_semaphores(
-                &self.relay_semaphores[old_index as usize..old_index as usize + 1],
-            );
-        }
+            .image_indices(&image_indices)
+            .wait_semaphores(swapchain_semaphores.get_present_wait_semaphores());

        let suboptimal = {
            profiling::scope!("vkQueuePresentKHR");