From c7458638d14921c7562e4197ddeefa17be413587 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Thu, 30 May 2024 16:53:34 -0400 Subject: [PATCH] [hal/vk] Rework Submission and Surface Synchronization (#5681) Fix two major synchronization issues in `wgpu_val::vulkan`: - Properly order queue command buffer submissions. Due to Mesa bugs, two semaphores are required even though the Vulkan spec says that only one should be necessary. - Properly manage surface texture acquisition and presentation: - Acquiring a surface texture can return while the presentation engine is still displaying the texture. Applications must wait for a semaphore to be signaled before using the acquired texture. - Presenting a surface texture requires a semaphore to ensure that drawing is complete before presentation occurs. Co-authored-by: Jim Blandy --- wgpu-core/src/device/queue.rs | 2 +- wgpu-core/src/present.rs | 15 +- wgpu-hal/examples/halmark/main.rs | 73 ++-- wgpu-hal/examples/raw-gles.rs | 3 +- wgpu-hal/examples/ray-traced-triangle/main.rs | 73 ++-- wgpu-hal/src/dx12/mod.rs | 11 +- wgpu-hal/src/empty.rs | 3 +- wgpu-hal/src/gles/egl.rs | 1 + wgpu-hal/src/gles/queue.rs | 12 +- wgpu-hal/src/gles/web.rs | 1 + wgpu-hal/src/gles/wgl.rs | 1 + wgpu-hal/src/lib.rs | 113 ++++- wgpu-hal/src/metal/mod.rs | 53 ++- wgpu-hal/src/metal/surface.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 20 +- wgpu-hal/src/vulkan/device.rs | 140 +++--- wgpu-hal/src/vulkan/instance.rs | 71 ++- wgpu-hal/src/vulkan/mod.rs | 408 +++++++++++++++--- 18 files changed, 698 insertions(+), 303 deletions(-) diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 168b36843..8eb46f0aa 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -1499,7 +1499,7 @@ impl Global { .raw .as_ref() .unwrap() - .submit(&refs, &submit_surface_textures, Some((fence, submit_index))) + .submit(&refs, &submit_surface_textures, (fence, submit_index)) .map_err(DeviceError::from)?; } diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs index 053f7fdb2..7f5939feb 100644 --- a/wgpu-core/src/present.rs +++ b/wgpu-core/src/present.rs @@ -154,17 +154,20 @@ impl Global { parent_id: surface_id, }); } - #[cfg(not(feature = "trace"))] - let _ = device; + + let fence_guard = device.fence.read(); + let fence = fence_guard.as_ref().unwrap(); let suf = A::surface_as_hal(surface.as_ref()); let (texture_id, status) = match unsafe { - suf.unwrap() - .acquire_texture(Some(std::time::Duration::from_millis( - FRAME_TIMEOUT_MS as u64, - ))) + suf.unwrap().acquire_texture( + Some(std::time::Duration::from_millis(FRAME_TIMEOUT_MS as u64)), + fence, + ) } { Ok(Some(ast)) => { + drop(fence_guard); + let texture_desc = wgt::TextureDescriptor { label: (), size: wgt::Extent3d { diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index 560aa6f8c..81474f233 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -22,7 +22,6 @@ const MAX_BUNNIES: usize = 1 << 20; const BUNNY_SIZE: f32 = 0.15 * 256.0; const GRAVITY: f32 = -9.8 * 100.0; const MAX_VELOCITY: f32 = 750.0; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; #[repr(C)] @@ -498,7 +497,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); device.wait(&fence, init_fence_value, !0).unwrap(); device.destroy_buffer(staging_buffer); @@ -550,7 +549,7 @@ impl Example { { let ctx = &mut self.contexts[self.context_index]; self.queue - .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value))) + .submit(&[], &[], (&mut ctx.fence, ctx.fence_value)) .unwrap(); } @@ -650,7 +649,13 @@ impl Example { let ctx = &mut self.contexts[self.context_index]; - let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture }; + let surface_tex = unsafe { + self.surface + .acquire_texture(None, &ctx.fence) + .unwrap() + .unwrap() + .texture + }; let target_barrier0 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -718,7 +723,6 @@ impl Example { } ctx.frames_recorded += 1; - let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT; let target_barrier1 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -732,45 +736,42 @@ impl Example { unsafe { let cmd_buf = ctx.encoder.end_encoding().unwrap(); - let fence_param = if do_fence { - Some((&mut ctx.fence, ctx.fence_value)) - } else { - None - }; self.queue - .submit(&[&cmd_buf], &[&surface_tex], fence_param) + .submit( + &[&cmd_buf], + &[&surface_tex], + (&mut ctx.fence, ctx.fence_value), + ) .unwrap(); self.queue.present(&self.surface, surface_tex).unwrap(); ctx.used_cmd_bufs.push(cmd_buf); ctx.used_views.push(surface_tex_view); }; - if do_fence { - log::debug!("Context switch from {}", self.context_index); - let old_fence_value = ctx.fence_value; - if self.contexts.len() == 1 { - let hal_desc = hal::CommandEncoderDescriptor { - label: None, - queue: &self.queue, - }; - self.contexts.push(unsafe { - ExecutionContext { - encoder: self.device.create_command_encoder(&hal_desc).unwrap(), - fence: self.device.create_fence().unwrap(), - fence_value: 0, - used_views: Vec::new(), - used_cmd_bufs: Vec::new(), - frames_recorded: 0, - } - }); - } - self.context_index = (self.context_index + 1) % self.contexts.len(); - let next = &mut self.contexts[self.context_index]; - unsafe { - next.wait_and_clear(&self.device); - } - next.fence_value = old_fence_value + 1; + log::debug!("Context switch from {}", self.context_index); + let old_fence_value = ctx.fence_value; + if self.contexts.len() == 1 { + let hal_desc = hal::CommandEncoderDescriptor { + label: None, + queue: &self.queue, + }; + self.contexts.push(unsafe { + ExecutionContext { + encoder: self.device.create_command_encoder(&hal_desc).unwrap(), + fence: self.device.create_fence().unwrap(), + fence_value: 0, + used_views: Vec::new(), + used_cmd_bufs: Vec::new(), + frames_recorded: 0, + } + }); } + self.context_index = (self.context_index + 1) % self.contexts.len(); + let next = &mut self.contexts[self.context_index]; + unsafe { + next.wait_and_clear(&self.device); + } + next.fence_value = old_fence_value + 1; } } diff --git a/wgpu-hal/examples/raw-gles.rs b/wgpu-hal/examples/raw-gles.rs index 342100e1c..675a51869 100644 --- a/wgpu-hal/examples/raw-gles.rs +++ b/wgpu-hal/examples/raw-gles.rs @@ -156,6 +156,7 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height }) .unwrap() }; + let mut fence = unsafe { od.device.create_fence().unwrap() }; let rp_desc = hal::RenderPassDescriptor { label: None, extent: wgt::Extent3d { @@ -183,6 +184,6 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height encoder.begin_render_pass(&rp_desc); encoder.end_render_pass(); let cmd_buf = encoder.end_encoding().unwrap(); - od.queue.submit(&[&cmd_buf], &[], None).unwrap(); + od.queue.submit(&[&cmd_buf], &[], (&mut fence, 0)).unwrap(); } } diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 90f0e6fc5..cf0e146ec 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -13,7 +13,6 @@ use std::{ }; use winit::window::WindowButtons; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; /// [D3D12_RAYTRACING_INSTANCE_DESC](https://microsoft.github.io/DirectX-Specs/d3d/Raytracing.html#d3d12_raytracing_instance_desc) @@ -759,7 +758,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); device.wait(&fence, init_fence_value, !0).unwrap(); cmd_encoder.reset_all(iter::once(init_cmd)); @@ -808,7 +807,13 @@ impl Example { fn render(&mut self) { let ctx = &mut self.contexts[self.context_index]; - let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture }; + let surface_tex = unsafe { + self.surface + .acquire_texture(None, &ctx.fence) + .unwrap() + .unwrap() + .texture + }; let target_barrier0 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -909,7 +914,6 @@ impl Example { } ctx.frames_recorded += 1; - let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT; let target_barrier1 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -959,45 +963,42 @@ impl Example { unsafe { let cmd_buf = ctx.encoder.end_encoding().unwrap(); - let fence_param = if do_fence { - Some((&mut ctx.fence, ctx.fence_value)) - } else { - None - }; self.queue - .submit(&[&cmd_buf], &[&surface_tex], fence_param) + .submit( + &[&cmd_buf], + &[&surface_tex], + (&mut ctx.fence, ctx.fence_value), + ) .unwrap(); self.queue.present(&self.surface, surface_tex).unwrap(); ctx.used_cmd_bufs.push(cmd_buf); ctx.used_views.push(surface_tex_view); }; - if do_fence { - log::info!("Context switch from {}", self.context_index); - let old_fence_value = ctx.fence_value; - if self.contexts.len() == 1 { - let hal_desc = hal::CommandEncoderDescriptor { - label: None, - queue: &self.queue, - }; - self.contexts.push(unsafe { - ExecutionContext { - encoder: self.device.create_command_encoder(&hal_desc).unwrap(), - fence: self.device.create_fence().unwrap(), - fence_value: 0, - used_views: Vec::new(), - used_cmd_bufs: Vec::new(), - frames_recorded: 0, - } - }); - } - self.context_index = (self.context_index + 1) % self.contexts.len(); - let next = &mut self.contexts[self.context_index]; - unsafe { - next.wait_and_clear(&self.device); - } - next.fence_value = old_fence_value + 1; + log::info!("Context switch from {}", self.context_index); + let old_fence_value = ctx.fence_value; + if self.contexts.len() == 1 { + let hal_desc = hal::CommandEncoderDescriptor { + label: None, + queue: &self.queue, + }; + self.contexts.push(unsafe { + ExecutionContext { + encoder: self.device.create_command_encoder(&hal_desc).unwrap(), + fence: self.device.create_fence().unwrap(), + fence_value: 0, + used_views: Vec::new(), + used_cmd_bufs: Vec::new(), + frames_recorded: 0, + } + }); } + self.context_index = (self.context_index + 1) % self.contexts.len(); + let next = &mut self.contexts[self.context_index]; + unsafe { + next.wait_and_clear(&self.device); + } + next.fence_value = old_fence_value + 1; } fn exit(mut self) { @@ -1005,7 +1006,7 @@ impl Example { { let ctx = &mut self.contexts[self.context_index]; self.queue - .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value))) + .submit(&[], &[], (&mut ctx.fence, ctx.fence_value)) .unwrap(); } diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs index 99800e87c..9d5f62f91 100644 --- a/wgpu-hal/src/dx12/mod.rs +++ b/wgpu-hal/src/dx12/mod.rs @@ -857,6 +857,7 @@ impl crate::Surface for Surface { unsafe fn acquire_texture( &self, timeout: Option, + _fence: &Fence, ) -> Result>, crate::SurfaceError> { let mut swapchain = self.swap_chain.write(); let sc = swapchain.as_mut().unwrap(); @@ -895,7 +896,7 @@ impl crate::Queue for Queue { &self, command_buffers: &[&CommandBuffer], _surface_textures: &[&Texture], - signal_fence: Option<(&mut Fence, crate::FenceValue)>, + (signal_fence, signal_value): (&mut Fence, crate::FenceValue), ) -> Result<(), crate::DeviceError> { let mut temp_lists = self.temp_lists.lock(); temp_lists.clear(); @@ -908,11 +909,9 @@ impl crate::Queue for Queue { self.raw.execute_command_lists(&temp_lists); } - if let Some((fence, value)) = signal_fence { - self.raw - .signal(&fence.raw, value) - .into_device_result("Signal fence")?; - } + self.raw + .signal(&signal_fence.raw, signal_value) + .into_device_result("Signal fence")?; // Note the lack of synchronization here between the main Direct queue // and the dedicated presentation queue. This is automatically handled diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs index f1986f770..8cba9d063 100644 --- a/wgpu-hal/src/empty.rs +++ b/wgpu-hal/src/empty.rs @@ -75,6 +75,7 @@ impl crate::Surface for Context { unsafe fn acquire_texture( &self, timeout: Option, + fence: &Resource, ) -> Result>, crate::SurfaceError> { Ok(None) } @@ -114,7 +115,7 @@ impl crate::Queue for Context { &self, command_buffers: &[&Resource], surface_textures: &[&Resource], - signal_fence: Option<(&mut Resource, crate::FenceValue)>, + signal_fence: (&mut Resource, crate::FenceValue), ) -> DeviceResult<()> { Ok(()) } diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs index 5ddf9b48b..07cd8e835 100644 --- a/wgpu-hal/src/gles/egl.rs +++ b/wgpu-hal/src/gles/egl.rs @@ -1432,6 +1432,7 @@ impl crate::Surface for Surface { unsafe fn acquire_texture( &self, _timeout_ms: Option, //TODO + _fence: &super::Fence, ) -> Result>, crate::SurfaceError> { let swapchain = self.swapchain.read(); let sc = swapchain.as_ref().unwrap(); diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs index f6b55a449..95eff36d5 100644 --- a/wgpu-hal/src/gles/queue.rs +++ b/wgpu-hal/src/gles/queue.rs @@ -1740,7 +1740,7 @@ impl crate::Queue for super::Queue { &self, command_buffers: &[&super::CommandBuffer], _surface_textures: &[&super::Texture], - signal_fence: Option<(&mut super::Fence, crate::FenceValue)>, + (signal_fence, signal_value): (&mut super::Fence, crate::FenceValue), ) -> Result<(), crate::DeviceError> { let shared = Arc::clone(&self.shared); let gl = &shared.context.lock(); @@ -1774,12 +1774,10 @@ impl crate::Queue for super::Queue { } } - if let Some((fence, value)) = signal_fence { - fence.maintain(gl); - let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) } - .map_err(|_| crate::DeviceError::OutOfMemory)?; - fence.pending.push((value, sync)); - } + signal_fence.maintain(gl); + let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) } + .map_err(|_| crate::DeviceError::OutOfMemory)?; + signal_fence.pending.push((signal_value, sync)); Ok(()) } diff --git a/wgpu-hal/src/gles/web.rs b/wgpu-hal/src/gles/web.rs index ab2ccef8b..081f7da5d 100644 --- a/wgpu-hal/src/gles/web.rs +++ b/wgpu-hal/src/gles/web.rs @@ -427,6 +427,7 @@ impl crate::Surface for Surface { unsafe fn acquire_texture( &self, _timeout_ms: Option, //TODO + _fence: &super::Fence, ) -> Result>, crate::SurfaceError> { let swapchain = self.swapchain.read(); let sc = swapchain.as_ref().unwrap(); diff --git a/wgpu-hal/src/gles/wgl.rs b/wgpu-hal/src/gles/wgl.rs index aae70478b..1111d98f8 100644 --- a/wgpu-hal/src/gles/wgl.rs +++ b/wgpu-hal/src/gles/wgl.rs @@ -798,6 +798,7 @@ impl crate::Surface for Surface { unsafe fn acquire_texture( &self, _timeout_ms: Option, + _fence: &super::Fence, ) -> Result>, crate::SurfaceError> { let swapchain = self.swapchain.read(); let sc = swapchain.as_ref().unwrap(); diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index da3834bcb..e81fad403 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -459,44 +459,101 @@ pub trait Instance: Sized + WasmNotSendSync { pub trait Surface: WasmNotSendSync { type A: Api; - /// Configures the surface to use the given device. + /// Configure `self` to use `device`. /// /// # Safety /// - /// - All gpu work that uses the surface must have been completed. + /// - All GPU work using `self` must have been completed. /// - All [`AcquiredSurfaceTexture`]s must have been destroyed. /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed. - /// - All surfaces created using other devices must have been unconfigured before this call. + /// - The surface `self` must not currently be configured to use any other [`Device`]. unsafe fn configure( &self, device: &::Device, config: &SurfaceConfiguration, ) -> Result<(), SurfaceError>; - /// Unconfigures the surface on the given device. + /// Unconfigure `self` on `device`. /// /// # Safety /// - /// - All gpu work that uses the surface must have been completed. + /// - All GPU work that uses `surface` must have been completed. /// - All [`AcquiredSurfaceTexture`]s must have been destroyed. /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed. - /// - The surface must have been configured on the given device. + /// - The surface `self` must have been configured on `device`. unsafe fn unconfigure(&self, device: &::Device); - /// Returns the next texture to be presented by the swapchain for drawing + /// Return the next texture to be presented by `self`, for the caller to draw on. /// - /// A `timeout` of `None` means to wait indefinitely, with no timeout. + /// On success, return an [`AcquiredSurfaceTexture`] representing the + /// texture into which the caller should draw the image to be displayed on + /// `self`. + /// + /// If `timeout` elapses before `self` has a texture ready to be acquired, + /// return `Ok(None)`. If `timeout` is `None`, wait indefinitely, with no + /// timeout. + /// + /// # Using an [`AcquiredSurfaceTexture`] + /// + /// On success, this function returns an [`AcquiredSurfaceTexture`] whose + /// [`texture`] field is a [`SurfaceTexture`] from which the caller can + /// [`borrow`] a [`Texture`] to draw on. The [`AcquiredSurfaceTexture`] also + /// carries some metadata about that [`SurfaceTexture`]. + /// + /// All calls to [`Queue::submit`] that draw on that [`Texture`] must also + /// include the [`SurfaceTexture`] in the `surface_textures` argument. + /// + /// When you are done drawing on the texture, you can display it on `self` + /// by passing the [`SurfaceTexture`] and `self` to [`Queue::present`]. + /// + /// If you do not wish to display the texture, you must pass the + /// [`SurfaceTexture`] to [`self.discard_texture`], so that it can be reused + /// by future acquisitions. /// /// # Portability /// - /// Some backends can't support a timeout when acquiring a texture and - /// the timeout will be ignored. + /// Some backends can't support a timeout when acquiring a texture. On these + /// backends, `timeout` is ignored. /// - /// Returns `None` on timing out. + /// # Safety + /// + /// - The surface `self` must currently be configured on some [`Device`]. + /// + /// - The `fence` argument must be the same [`Fence`] passed to all calls to + /// [`Queue::submit`] that used [`Texture`]s acquired from this surface. + /// + /// - You may only have one texture acquired from `self` at a time. When + /// `acquire_texture` returns `Ok(Some(ast))`, you must pass the returned + /// [`SurfaceTexture`] `ast.texture` to either [`Queue::present`] or + /// [`Surface::discard_texture`] before calling `acquire_texture` again. + /// + /// [`texture`]: AcquiredSurfaceTexture::texture + /// [`SurfaceTexture`]: Api::SurfaceTexture + /// [`borrow`]: std::borrow::Borrow::borrow + /// [`Texture`]: Api::Texture + /// [`Fence`]: Api::Fence + /// [`self.discard_texture`]: Surface::discard_texture unsafe fn acquire_texture( &self, timeout: Option, + fence: &::Fence, ) -> Result>, SurfaceError>; + + /// Relinquish an acquired texture without presenting it. + /// + /// After this call, the texture underlying [`SurfaceTexture`] may be + /// returned by subsequent calls to [`self.acquire_texture`]. + /// + /// # Safety + /// + /// - The surface `self` must currently be configured on some [`Device`]. + /// + /// - `texture` must be a [`SurfaceTexture`] returned by a call to + /// [`self.acquire_texture`] that has not yet been passed to + /// [`Queue::present`]. + /// + /// [`SurfaceTexture`]: Api::SurfaceTexture + /// [`self.acquire_texture`]: Surface::acquire_texture unsafe fn discard_texture(&self, texture: ::SurfaceTexture); } @@ -762,19 +819,23 @@ pub trait Queue: WasmNotSendSync { /// Submit `command_buffers` for execution on GPU. /// - /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value` - /// when the operation is complete. See [`Fence`] for details. + /// Update `fence` to `value` when the operation is complete. See + /// [`Fence`] for details. /// - /// If two calls to `submit` on a single `Queue` occur in a particular order - /// (that is, they happen on the same thread, or on two threads that have - /// synchronized to establish an ordering), then the first submission's - /// commands all complete execution before any of the second submission's - /// commands begin. All results produced by one submission are visible to - /// the next. + /// A `wgpu_hal` queue is "single threaded": all command buffers are + /// executed in the order they're submitted, with each buffer able to see + /// previous buffers' results. Specifically: /// - /// Within a submission, command buffers execute in the order in which they - /// appear in `command_buffers`. All results produced by one buffer are - /// visible to the next. + /// - If two calls to `submit` on a single `Queue` occur in a particular + /// order (that is, they happen on the same thread, or on two threads that + /// have synchronized to establish an ordering), then the first + /// submission's commands all complete execution before any of the second + /// submission's commands begin. All results produced by one submission + /// are visible to the next. + /// + /// - Within a submission, command buffers execute in the order in which they + /// appear in `command_buffers`. All results produced by one buffer are + /// visible to the next. /// /// If two calls to `submit` on a single `Queue` from different threads are /// not synchronized to occur in a particular order, they must pass distinct @@ -803,10 +864,16 @@ pub trait Queue: WasmNotSendSync { /// - Every [`SurfaceTexture`][st] that any command in `command_buffers` /// writes to must appear in the `surface_textures` argument. /// + /// - No [`SurfaceTexture`][st] may appear in the `surface_textures` + /// argument more than once. + /// /// - Each [`SurfaceTexture`][st] in `surface_textures` must be configured /// for use with the [`Device`][d] associated with this [`Queue`], /// typically by calling [`Surface::configure`]. /// + /// - All calls to this function that include a given [`SurfaceTexture`][st] + /// in `surface_textures` must use the same [`Fence`]. + /// /// [`Fence`]: Api::Fence /// [cb]: Api::CommandBuffer /// [ce]: Api::CommandEncoder @@ -819,7 +886,7 @@ pub trait Queue: WasmNotSendSync { &self, command_buffers: &[&::CommandBuffer], surface_textures: &[&::SurfaceTexture], - signal_fence: Option<(&mut ::Fence, FenceValue)>, + signal_fence: (&mut ::Fence, FenceValue), ) -> Result<(), DeviceError>; unsafe fn present( &self, diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index ce8e01592..1867d7de4 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -377,38 +377,37 @@ impl crate::Queue for Queue { &self, command_buffers: &[&CommandBuffer], _surface_textures: &[&SurfaceTexture], - signal_fence: Option<(&mut Fence, crate::FenceValue)>, + (signal_fence, signal_value): (&mut Fence, crate::FenceValue), ) -> Result<(), crate::DeviceError> { objc::rc::autoreleasepool(|| { - let extra_command_buffer = match signal_fence { - Some((fence, value)) => { - let completed_value = Arc::clone(&fence.completed_value); - let block = block::ConcreteBlock::new(move |_cmd_buf| { - completed_value.store(value, atomic::Ordering::Release); - }) - .copy(); + let extra_command_buffer = { + let completed_value = Arc::clone(&signal_fence.completed_value); + let block = block::ConcreteBlock::new(move |_cmd_buf| { + completed_value.store(signal_value, atomic::Ordering::Release); + }) + .copy(); - let raw = match command_buffers.last() { - Some(&cmd_buf) => cmd_buf.raw.to_owned(), - None => { - let queue = self.raw.lock(); - queue - .new_command_buffer_with_unretained_references() - .to_owned() - } - }; - raw.set_label("(wgpu internal) Signal"); - raw.add_completed_handler(&block); - - fence.maintain(); - fence.pending_command_buffers.push((value, raw.to_owned())); - // only return an extra one if it's extra - match command_buffers.last() { - Some(_) => None, - None => Some(raw), + let raw = match command_buffers.last() { + Some(&cmd_buf) => cmd_buf.raw.to_owned(), + None => { + let queue = self.raw.lock(); + queue + .new_command_buffer_with_unretained_references() + .to_owned() } + }; + raw.set_label("(wgpu internal) Signal"); + raw.add_completed_handler(&block); + + signal_fence.maintain(); + signal_fence + .pending_command_buffers + .push((signal_value, raw.to_owned())); + // only return an extra one if it's extra + match command_buffers.last() { + Some(_) => None, + None => Some(raw), } - None => None, }; for cmd_buffer in command_buffers { diff --git a/wgpu-hal/src/metal/surface.rs b/wgpu-hal/src/metal/surface.rs index e1eb6d5b2..1a1105660 100644 --- a/wgpu-hal/src/metal/surface.rs +++ b/wgpu-hal/src/metal/surface.rs @@ -242,6 +242,7 @@ impl crate::Surface for super::Surface { unsafe fn acquire_texture( &self, _timeout_ms: Option, //TODO + _fence: &super::Fence, ) -> Result>, crate::SurfaceError> { let render_layer = self.render_layer.lock(); let (drawable, texture) = match autoreleasepool(|| { diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 6df999084..fe2a6f970 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -3,11 +3,7 @@ use super::conv; use ash::{amd, ext, khr, vk}; use parking_lot::Mutex; -use std::{ - collections::BTreeMap, - ffi::CStr, - sync::{atomic::AtomicIsize, Arc}, -}; +use std::{collections::BTreeMap, ffi::CStr, sync::Arc}; fn depth_stencil_required_flags() -> vk::FormatFeatureFlags { vk::FormatFeatureFlags::SAMPLED_IMAGE | vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT @@ -1783,21 +1779,15 @@ impl super::Adapter { render_passes: Mutex::new(Default::default()), framebuffers: Mutex::new(Default::default()), }); - let mut relay_semaphores = [vk::Semaphore::null(); 2]; - for sem in relay_semaphores.iter_mut() { - unsafe { - *sem = shared - .raw - .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)? - }; - } + + let relay_semaphores = super::RelaySemaphores::new(&shared)?; + let queue = super::Queue { raw: raw_queue, swapchain_fn, device: Arc::clone(&shared), family_index, - relay_semaphores, - relay_index: AtomicIsize::new(-1), + relay_semaphores: Mutex::new(relay_semaphores), }; let mem_allocator = { diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs index 1ea627897..867b7efb2 100644 --- a/wgpu-hal/src/vulkan/device.rs +++ b/wgpu-hal/src/vulkan/device.rs @@ -612,17 +612,16 @@ impl super::Device { let images = unsafe { functor.get_swapchain_images(raw) }.map_err(crate::DeviceError::from)?; - // NOTE: It's important that we define at least images.len() + 1 wait + // NOTE: It's important that we define at least images.len() wait // semaphores, since we prospectively need to provide the call to // acquire the next image with an unsignaled semaphore. - let surface_semaphores = (0..images.len() + 1) - .map(|_| unsafe { - self.shared - .raw - .create_semaphore(&vk::SemaphoreCreateInfo::default(), None) + let surface_semaphores = (0..=images.len()) + .map(|_| { + super::SwapchainImageSemaphores::new(&self.shared) + .map(Mutex::new) + .map(Arc::new) }) - .collect::, _>>() - .map_err(crate::DeviceError::from)?; + .collect::, _>>()?; Ok(super::Swapchain { raw, @@ -633,7 +632,7 @@ impl super::Device { config: config.clone(), view_formats: wgt_view_formats, surface_semaphores, - next_surface_index: 0, + next_semaphore_index: 0, }) } @@ -836,9 +835,12 @@ impl crate::Device for super::Device { unsafe fn exit(self, queue: super::Queue) { unsafe { self.mem_allocator.into_inner().cleanup(&*self.shared) }; unsafe { self.desc_allocator.into_inner().cleanup(&*self.shared) }; - for &sem in queue.relay_semaphores.iter() { - unsafe { self.shared.raw.destroy_semaphore(sem, None) }; - } + unsafe { + queue + .relay_semaphores + .into_inner() + .destroy(&self.shared.raw) + }; unsafe { self.shared.free_resources() }; } @@ -2055,54 +2057,7 @@ impl crate::Device for super::Device { timeout_ms: u32, ) -> Result { let timeout_ns = timeout_ms as u64 * super::MILLIS_TO_NANOS; - match *fence { - super::Fence::TimelineSemaphore(raw) => { - let semaphores = [raw]; - let values = [wait_value]; - let vk_info = vk::SemaphoreWaitInfo::default() - .semaphores(&semaphores) - .values(&values); - let result = match self.shared.extension_fns.timeline_semaphore { - Some(super::ExtensionFn::Extension(ref ext)) => unsafe { - ext.wait_semaphores(&vk_info, timeout_ns) - }, - Some(super::ExtensionFn::Promoted) => unsafe { - self.shared.raw.wait_semaphores(&vk_info, timeout_ns) - }, - None => unreachable!(), - }; - match result { - Ok(()) => Ok(true), - Err(vk::Result::TIMEOUT) => Ok(false), - Err(other) => Err(other.into()), - } - } - super::Fence::FencePool { - last_completed, - ref active, - free: _, - } => { - if wait_value <= last_completed { - Ok(true) - } else { - match active.iter().find(|&&(value, _)| value >= wait_value) { - Some(&(_, raw)) => { - match unsafe { - self.shared.raw.wait_for_fences(&[raw], true, timeout_ns) - } { - Ok(()) => Ok(true), - Err(vk::Result::TIMEOUT) => Ok(false), - Err(other) => Err(other.into()), - } - } - None => { - log::error!("No signals reached value {}", wait_value); - Err(crate::DeviceError::Lost) - } - } - } - } - } + self.shared.wait_for_fence(fence, wait_value, timeout_ns) } unsafe fn start_capture(&self) -> bool { @@ -2364,6 +2319,71 @@ impl crate::Device for super::Device { } } +impl super::DeviceShared { + pub(super) fn new_binary_semaphore(&self) -> Result { + unsafe { + self.raw + .create_semaphore(&vk::SemaphoreCreateInfo::default(), None) + .map_err(crate::DeviceError::from) + } + } + + pub(super) fn wait_for_fence( + &self, + fence: &super::Fence, + wait_value: crate::FenceValue, + timeout_ns: u64, + ) -> Result { + profiling::scope!("Device::wait"); + match *fence { + super::Fence::TimelineSemaphore(raw) => { + let semaphores = [raw]; + let values = [wait_value]; + let vk_info = vk::SemaphoreWaitInfo::default() + .semaphores(&semaphores) + .values(&values); + let result = match self.extension_fns.timeline_semaphore { + Some(super::ExtensionFn::Extension(ref ext)) => unsafe { + ext.wait_semaphores(&vk_info, timeout_ns) + }, + Some(super::ExtensionFn::Promoted) => unsafe { + self.raw.wait_semaphores(&vk_info, timeout_ns) + }, + None => unreachable!(), + }; + match result { + Ok(()) => Ok(true), + Err(vk::Result::TIMEOUT) => Ok(false), + Err(other) => Err(other.into()), + } + } + super::Fence::FencePool { + last_completed, + ref active, + free: _, + } => { + if wait_value <= last_completed { + Ok(true) + } else { + match active.iter().find(|&&(value, _)| value >= wait_value) { + Some(&(_, raw)) => { + match unsafe { self.raw.wait_for_fences(&[raw], true, timeout_ns) } { + Ok(()) => Ok(true), + Err(vk::Result::TIMEOUT) => Ok(false), + Err(other) => Err(other.into()), + } + } + None => { + log::error!("No signals reached value {}", wait_value); + Err(crate::DeviceError::Lost) + } + } + } + } + } + } +} + impl From for crate::DeviceError { fn from(error: gpu_alloc::AllocationError) -> Self { use gpu_alloc::AllocationError as Ae; diff --git a/wgpu-hal/src/vulkan/instance.rs b/wgpu-hal/src/vulkan/instance.rs index 6f471f890..18acaeabb 100644 --- a/wgpu-hal/src/vulkan/instance.rs +++ b/wgpu-hal/src/vulkan/instance.rs @@ -164,10 +164,14 @@ impl super::Swapchain { let _ = unsafe { device.device_wait_idle() }; }; + // We cannot take this by value, as the function returns `self`. for semaphore in self.surface_semaphores.drain(..) { - unsafe { - device.destroy_semaphore(semaphore, None); - } + let arc_removed = Arc::into_inner(semaphore).expect( + "Trying to destroy a SurfaceSemaphores that is still in use by a SurfaceTexture", + ); + let mutex_removed = arc_removed.into_inner(); + + unsafe { mutex_removed.destroy(device) }; } self @@ -966,9 +970,10 @@ impl crate::Surface for super::Surface { unsafe fn acquire_texture( &self, timeout: Option, + fence: &super::Fence, ) -> Result>, crate::SurfaceError> { let mut swapchain = self.swapchain.write(); - let sc = swapchain.as_mut().unwrap(); + let swapchain = swapchain.as_mut().unwrap(); let mut timeout_ns = match timeout { Some(duration) => duration.as_nanos() as u64, @@ -988,12 +993,40 @@ impl crate::Surface for super::Surface { timeout_ns = u64::MAX; } - let wait_semaphore = sc.surface_semaphores[sc.next_surface_index]; + let swapchain_semaphores_arc = swapchain.get_surface_semaphores(); + // Nothing should be using this, so we don't block, but panic if we fail to lock. + let locked_swapchain_semaphores = swapchain_semaphores_arc + .try_lock() + .expect("Failed to lock a SwapchainSemaphores."); + + // Wait for all commands writing to the previously acquired image to + // complete. + // + // Almost all the steps in the usual acquire-draw-present flow are + // asynchronous: they get something started on the presentation engine + // or the GPU, but on the CPU, control returns immediately. Without some + // sort of intervention, the CPU could crank out frames much faster than + // the presentation engine can display them. + // + // This is the intervention: if any submissions drew on this image, and + // thus waited for `locked_swapchain_semaphores.acquire`, wait for all + // of them to finish, thus ensuring that it's okay to pass `acquire` to + // `vkAcquireNextImageKHR` again. + swapchain.device.wait_for_fence( + fence, + locked_swapchain_semaphores.previously_used_submission_index, + timeout_ns, + )?; // will block if no image is available let (index, suboptimal) = match unsafe { - sc.functor - .acquire_next_image(sc.raw, timeout_ns, wait_semaphore, vk::Fence::null()) + profiling::scope!("vkAcquireNextImageKHR"); + swapchain.functor.acquire_next_image( + swapchain.raw, + timeout_ns, + locked_swapchain_semaphores.acquire, + vk::Fence::null(), + ) } { // We treat `VK_SUBOPTIMAL_KHR` as `VK_SUCCESS` on Android. // See the comment in `Queue::present`. @@ -1013,16 +1046,18 @@ impl crate::Surface for super::Surface { } }; - sc.next_surface_index += 1; - sc.next_surface_index %= sc.surface_semaphores.len(); + drop(locked_swapchain_semaphores); + // We only advance the surface semaphores if we successfully acquired an image, otherwise + // we should try to re-acquire using the same semaphores. + swapchain.advance_surface_semaphores(); // special case for Intel Vulkan returning bizarre values (ugh) - if sc.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 { + if swapchain.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 { return Err(crate::SurfaceError::Outdated); } // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-framebuffer-03209 - let raw_flags = if sc + let raw_flags = if swapchain .raw_flags .contains(vk::SwapchainCreateFlagsKHR::MUTABLE_FORMAT) { @@ -1034,20 +1069,20 @@ impl crate::Surface for super::Surface { let texture = super::SurfaceTexture { index, texture: super::Texture { - raw: sc.images[index as usize], + raw: swapchain.images[index as usize], drop_guard: None, block: None, - usage: sc.config.usage, - format: sc.config.format, + usage: swapchain.config.usage, + format: swapchain.config.format, raw_flags, copy_size: crate::CopyExtent { - width: sc.config.extent.width, - height: sc.config.extent.height, + width: swapchain.config.extent.width, + height: swapchain.config.extent.height, depth: 1, }, - view_formats: sc.view_formats.clone(), + view_formats: swapchain.view_formats.clone(), }, - wait_semaphore, + surface_semaphores: swapchain_semaphores_arc, }; Ok(Some(crate::AcquiredSurfaceTexture { texture, diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index 1716ee920..40e7a2cb4 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -33,13 +33,11 @@ mod instance; use std::{ borrow::Borrow, + collections::HashSet, ffi::{CStr, CString}, - fmt, + fmt, mem, num::NonZeroU32, - sync::{ - atomic::{AtomicIsize, Ordering}, - Arc, - }, + sync::Arc, }; use arrayvec::ArrayVec; @@ -147,6 +145,173 @@ pub struct Instance { shared: Arc, } +/// The semaphores needed to use one image in a swapchain. +#[derive(Debug)] +struct SwapchainImageSemaphores { + /// A semaphore that is signaled when this image is safe for us to modify. + /// + /// When [`vkAcquireNextImageKHR`] returns the index of the next swapchain + /// image that we should use, that image may actually still be in use by the + /// presentation engine, and is not yet safe to modify. However, that + /// function does accept a semaphore that it will signal when the image is + /// indeed safe to begin messing with. + /// + /// This semaphore is: + /// + /// - waited for by the first queue submission to operate on this image + /// since it was acquired, and + /// + /// - signaled by [`vkAcquireNextImageKHR`] when the acquired image is ready + /// for us to use. + /// + /// [`vkAcquireNextImageKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkAcquireNextImageKHR + acquire: vk::Semaphore, + + /// True if the next command submission operating on this image should wait + /// for [`acquire`]. + /// + /// We must wait for `acquire` before drawing to this swapchain image, but + /// because `wgpu-hal` queue submissions are always strongly ordered, only + /// the first submission that works with a swapchain image actually needs to + /// wait. We set this flag when this image is acquired, and clear it the + /// first time it's passed to [`Queue::submit`] as a surface texture. + /// + /// [`acquire`]: SwapchainImageSemaphores::acquire + /// [`Queue::submit`]: crate::Queue::submit + should_wait_for_acquire: bool, + + /// A pool of semaphores for ordering presentation after drawing. + /// + /// The first [`present_index`] semaphores in this vector are: + /// + /// - all waited on by the call to [`vkQueuePresentKHR`] that presents this + /// image, and + /// + /// - each signaled by some [`vkQueueSubmit`] queue submission that draws to + /// this image, when the submission finishes execution. + /// + /// This vector accumulates one semaphore per submission that writes to this + /// image. This is awkward, but hard to avoid: [`vkQueuePresentKHR`] + /// requires a semaphore to order it with respect to drawing commands, and + /// we can't attach new completion semaphores to a command submission after + /// it's been submitted. This means that, at submission time, we must create + /// the semaphore we might need if the caller's next action is to enqueue a + /// presentation of this image. + /// + /// An alternative strategy would be for presentation to enqueue an empty + /// submit, ordered relative to other submits in the usual way, and + /// signaling a single presentation semaphore. But we suspect that submits + /// are usually expensive enough, and semaphores usually cheap enough, that + /// performance-sensitive users will avoid making many submits, so that the + /// cost of accumulated semaphores will usually be less than the cost of an + /// additional submit. + /// + /// Only the first [`present_index`] semaphores in the vector are actually + /// going to be signalled by submitted commands, and need to be waited for + /// by the next present call. Any semaphores beyond that index were created + /// for prior presents and are simply being retained for recycling. + /// + /// [`present_index`]: SwapchainImageSemaphores::present_index + /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR + /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit + present: Vec, + + /// The number of semaphores in [`present`] to be signalled for this submission. + /// + /// [`present`]: SwapchainImageSemaphores::present + present_index: usize, + + /// The fence value of the last command submission that wrote to this image. + /// + /// The next time we try to acquire this image, we'll block until + /// this submission finishes, proving that [`acquire`] is ready to + /// pass to `vkAcquireNextImageKHR` again. + /// + /// [`acquire`]: SwapchainImageSemaphores::acquire + previously_used_submission_index: crate::FenceValue, +} + +impl SwapchainImageSemaphores { + fn new(device: &DeviceShared) -> Result { + Ok(Self { + acquire: device.new_binary_semaphore()?, + should_wait_for_acquire: true, + present: Vec::new(), + present_index: 0, + previously_used_submission_index: 0, + }) + } + + fn set_used_fence_value(&mut self, value: crate::FenceValue) { + self.previously_used_submission_index = value; + } + + /// Return the semaphore that commands drawing to this image should wait for, if any. + /// + /// This only returns `Some` once per acquisition; see + /// [`SwapchainImageSemaphores::should_wait_for_acquire`] for details. + fn get_acquire_wait_semaphore(&mut self) -> Option { + if self.should_wait_for_acquire { + self.should_wait_for_acquire = false; + Some(self.acquire) + } else { + None + } + } + + /// Return a semaphore that a submission that writes to this image should + /// signal when it's done. + /// + /// See [`SwapchainImageSemaphores::present`] for details. + fn get_submit_signal_semaphore( + &mut self, + device: &DeviceShared, + ) -> Result { + // Try to recycle a semaphore we created for a previous presentation. + let sem = match self.present.get(self.present_index) { + Some(sem) => *sem, + None => { + let sem = device.new_binary_semaphore()?; + self.present.push(sem); + sem + } + }; + + self.present_index += 1; + + Ok(sem) + } + + /// Return the semaphores that a presentation of this image should wait on. + /// + /// Return a slice of semaphores that the call to [`vkQueueSubmit`] that + /// ends this image's acquisition should wait for. See + /// [`SwapchainImageSemaphores::present`] for details. + /// + /// Reset `self` to be ready for the next acquisition cycle. + /// + /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit + fn get_present_wait_semaphores(&mut self) -> &[vk::Semaphore] { + let old_index = self.present_index; + + // Since this marks the end of this acquire/draw/present cycle, take the + // opportunity to reset `self` in preparation for the next acquisition. + self.present_index = 0; + self.should_wait_for_acquire = true; + + &self.present[0..old_index] + } + + unsafe fn destroy(&self, device: &ash::Device) { + unsafe { + device.destroy_semaphore(self.acquire, None); + for sem in &self.present { + device.destroy_semaphore(*sem, None); + } + } + } +} + struct Swapchain { raw: vk::SwapchainKHR, raw_flags: vk::SwapchainCreateFlagsKHR, @@ -157,9 +322,25 @@ struct Swapchain { view_formats: Vec, /// One wait semaphore per swapchain image. This will be associated with the /// surface texture, and later collected during submission. - surface_semaphores: Vec, - /// Current semaphore index to use when acquiring a surface. - next_surface_index: usize, + /// + /// We need this to be `Arc>` because we need to be able to pass this + /// data into the surface texture, so submit/present can use it. + surface_semaphores: Vec>>, + /// The index of the next semaphore to use. Ideally we would use the same + /// index as the image index, but we need to specify the semaphore as an argument + /// to the acquire_next_image function which is what tells us which image to use. + next_semaphore_index: usize, +} + +impl Swapchain { + fn advance_surface_semaphores(&mut self) { + let semaphore_count = self.surface_semaphores.len(); + self.next_semaphore_index = (self.next_semaphore_index + 1) % semaphore_count; + } + + fn get_surface_semaphores(&self) -> Arc> { + self.surface_semaphores[self.next_semaphore_index].clone() + } } pub struct Surface { @@ -173,7 +354,7 @@ pub struct Surface { pub struct SurfaceTexture { index: u32, texture: Texture, - wait_semaphore: vk::Semaphore, + surface_semaphores: Arc>, } impl Borrow for SurfaceTexture { @@ -359,18 +540,87 @@ pub struct Device { render_doc: crate::auxil::renderdoc::RenderDoc, } +/// Semaphores for forcing queue submissions to run in order. +/// +/// The [`wgpu_hal::Queue`] trait promises that if two calls to [`submit`] are +/// ordered, then the first submission will finish on the GPU before the second +/// submission begins. To get this behavior on Vulkan we need to pass semaphores +/// to [`vkQueueSubmit`] for the commands to wait on before beginning execution, +/// and to signal when their execution is done. +/// +/// Normally this can be done with a single semaphore, waited on and then +/// signalled for each submission. At any given time there's exactly one +/// submission that would signal the semaphore, and exactly one waiting on it, +/// as Vulkan requires. +/// +/// However, as of Oct 2021, bug [#5508] in the Mesa ANV drivers caused them to +/// hang if we use a single semaphore. The workaround is to alternate between +/// two semaphores. The bug has been fixed in Mesa, but we should probably keep +/// the workaround until, say, Oct 2026. +/// +/// [`wgpu_hal::Queue`]: crate::Queue +/// [`submit`]: crate::Queue::submit +/// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit +/// [#5508]: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508 +#[derive(Clone)] +struct RelaySemaphores { + /// The semaphore the next submission should wait on before beginning + /// execution on the GPU. This is `None` for the first submission, which + /// should not wait on anything at all. + wait: Option, + + /// The semaphore the next submission should signal when it has finished + /// execution on the GPU. + signal: vk::Semaphore, +} + +impl RelaySemaphores { + fn new(device: &DeviceShared) -> Result { + Ok(Self { + wait: None, + signal: device.new_binary_semaphore()?, + }) + } + + /// Advances the semaphores, returning the semaphores that should be used for a submission. + fn advance(&mut self, device: &DeviceShared) -> Result { + let old = self.clone(); + + // Build the state for the next submission. + match self.wait { + None => { + // The `old` values describe the first submission to this queue. + // The second submission should wait on `old.signal`, and then + // signal a new semaphore which we'll create now. + self.wait = Some(old.signal); + self.signal = device.new_binary_semaphore()?; + } + Some(ref mut wait) => { + // What this submission signals, the next should wait. + mem::swap(wait, &mut self.signal); + } + }; + + Ok(old) + } + + /// Destroys the semaphores. + unsafe fn destroy(&self, device: &ash::Device) { + unsafe { + if let Some(wait) = self.wait { + device.destroy_semaphore(wait, None); + } + device.destroy_semaphore(self.signal, None); + } + } +} + pub struct Queue { raw: vk::Queue, swapchain_fn: khr::swapchain::Device, device: Arc, family_index: u32, - /// We use a redundant chain of semaphores to pass on the signal - /// from submissions to the last present, since it's required by the - /// specification. - /// It would be correct to use a single semaphore there, but - /// [Intel hangs in `anv_queue_finish`](https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508). - relay_semaphores: [vk::Semaphore; 2], - relay_index: AtomicIsize, + relay_semaphores: Mutex, } #[derive(Debug)] @@ -702,58 +952,89 @@ impl crate::Queue for Queue { &self, command_buffers: &[&CommandBuffer], surface_textures: &[&SurfaceTexture], - signal_fence: Option<(&mut Fence, crate::FenceValue)>, + (signal_fence, signal_value): (&mut Fence, crate::FenceValue), ) -> Result<(), crate::DeviceError> { let mut fence_raw = vk::Fence::null(); let mut wait_stage_masks = Vec::new(); let mut wait_semaphores = Vec::new(); - let mut signal_semaphores = ArrayVec::<_, 2>::new(); - let mut signal_values = ArrayVec::<_, 2>::new(); + let mut signal_semaphores = Vec::new(); + let mut signal_values = Vec::new(); - for &surface_texture in surface_textures { - wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE); - wait_semaphores.push(surface_texture.wait_semaphore); + // Double check that the same swapchain image isn't being given to us multiple times, + // as that will deadlock when we try to lock them all. + debug_assert!( + { + let mut check = HashSet::with_capacity(surface_textures.len()); + // We compare the Arcs by pointer, as Eq isn't well defined for SurfaceSemaphores. + for st in surface_textures { + check.insert(Arc::as_ptr(&st.surface_semaphores)); + } + check.len() == surface_textures.len() + }, + "More than one surface texture is being used from the same swapchain. This will cause a deadlock in release." + ); + + let locked_swapchain_semaphores = surface_textures + .iter() + .map(|st| { + st.surface_semaphores + .try_lock() + .expect("Failed to lock surface semaphore.") + }) + .collect::>(); + + for mut swapchain_semaphore in locked_swapchain_semaphores { + swapchain_semaphore.set_used_fence_value(signal_value); + + // If we're the first submission to operate on this image, wait on + // its acquire semaphore, to make sure the presentation engine is + // done with it. + if let Some(sem) = swapchain_semaphore.get_acquire_wait_semaphore() { + wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE); + wait_semaphores.push(sem); + } + + // Get a semaphore to signal when we're done writing to this surface + // image. Presentation of this image will wait for this. + let signal_semaphore = swapchain_semaphore.get_submit_signal_semaphore(&self.device)?; + signal_semaphores.push(signal_semaphore); + signal_values.push(!0); } - let old_index = self.relay_index.load(Ordering::Relaxed); + // In order for submissions to be strictly ordered, we encode a dependency between each submission + // using a pair of semaphores. This adds a wait if it is needed, and signals the next semaphore. + let semaphore_state = self.relay_semaphores.lock().advance(&self.device)?; - let sem_index = if old_index >= 0 { + if let Some(sem) = semaphore_state.wait { wait_stage_masks.push(vk::PipelineStageFlags::TOP_OF_PIPE); - wait_semaphores.push(self.relay_semaphores[old_index as usize]); - (old_index as usize + 1) % self.relay_semaphores.len() - } else { - 0 - }; + wait_semaphores.push(sem); + } - signal_semaphores.push(self.relay_semaphores[sem_index]); + signal_semaphores.push(semaphore_state.signal); + signal_values.push(!0); - self.relay_index - .store(sem_index as isize, Ordering::Relaxed); - - if let Some((fence, value)) = signal_fence { - fence.maintain(&self.device.raw)?; - match *fence { - Fence::TimelineSemaphore(raw) => { - signal_semaphores.push(raw); - signal_values.push(!0); - signal_values.push(value); - } - Fence::FencePool { - ref mut active, - ref mut free, - .. - } => { - fence_raw = match free.pop() { - Some(raw) => raw, - None => unsafe { - self.device - .raw - .create_fence(&vk::FenceCreateInfo::default(), None)? - }, - }; - active.push((value, fence_raw)); - } + // We need to signal our wgpu::Fence if we have one, this adds it to the signal list. + signal_fence.maintain(&self.device.raw)?; + match *signal_fence { + Fence::TimelineSemaphore(raw) => { + signal_semaphores.push(raw); + signal_values.push(signal_value); + } + Fence::FencePool { + ref mut active, + ref mut free, + .. + } => { + fence_raw = match free.pop() { + Some(raw) => raw, + None => unsafe { + self.device + .raw + .create_fence(&vk::FenceCreateInfo::default(), None)? + }, + }; + active.push((signal_value, fence_raw)); } } @@ -771,7 +1052,7 @@ impl crate::Queue for Queue { let mut vk_timeline_info; - if !signal_values.is_empty() { + if self.device.private_caps.timeline_semaphores { vk_timeline_info = vk::TimelineSemaphoreSubmitInfo::default().signal_semaphore_values(&signal_values); vk_info = vk_info.push_next(&mut vk_timeline_info); @@ -793,19 +1074,14 @@ impl crate::Queue for Queue { ) -> Result<(), crate::SurfaceError> { let mut swapchain = surface.swapchain.write(); let ssc = swapchain.as_mut().unwrap(); + let mut swapchain_semaphores = texture.surface_semaphores.lock(); let swapchains = [ssc.raw]; let image_indices = [texture.index]; - let mut vk_info = vk::PresentInfoKHR::default() + let vk_info = vk::PresentInfoKHR::default() .swapchains(&swapchains) - .image_indices(&image_indices); - - let old_index = self.relay_index.swap(-1, Ordering::Relaxed); - if old_index >= 0 { - vk_info = vk_info.wait_semaphores( - &self.relay_semaphores[old_index as usize..old_index as usize + 1], - ); - } + .image_indices(&image_indices) + .wait_semaphores(swapchain_semaphores.get_present_wait_semaphores()); let suboptimal = { profiling::scope!("vkQueuePresentKHR");