[core] Make poll(Wait) not hang after bad command submission.

Add `wgpu_core::device::Device::last_successful_submission_index`,
which records the fence value that `Maintain::Wait` should actually
wait for. See comments for details.

Fixes #5969.
Jim Blandy 2024-07-16 20:43:33 -07:00
parent 2bc328c46f
commit aeb2067e81
3 changed files with 90 additions and 24 deletions
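
The mechanism is easiest to see in isolation. Below is a minimal, self-contained Rust sketch of the failure mode and the fix the commit message describes: an index handed out for every submission *attempt* can outrun the fence forever if a submission fails, while an index advanced only on *success* is always safe to wait for. The type `FakeGpu` and its fields `attempted`, `last_successful`, and `signalled` are hypothetical stand-ins for wgpu-core's bookkeeping, not its actual code.

```rust
// Hypothetical model of the hang this commit fixes; not wgpu-core's real code.
use std::sync::atomic::{AtomicU64, Ordering};

struct FakeGpu {
    /// Index claimed for every submission *attempt* (may never be signalled).
    attempted: AtomicU64,
    /// Index of the last submission that actually reached the fence.
    last_successful: AtomicU64,
    /// Highest fence value the "hardware" has signalled so far.
    signalled: AtomicU64,
}

impl FakeGpu {
    fn submit(&self, ok: bool) {
        // Every attempt claims a new index, even if validation later fails.
        let index = self.attempted.fetch_add(1, Ordering::SeqCst) + 1;
        if ok {
            // Only a successful submission signals the fence and advances the
            // "safe to wait for" index.
            self.signalled.fetch_max(index, Ordering::SeqCst);
            self.last_successful.fetch_max(index, Ordering::SeqCst);
        }
    }

    fn wait_all(&self) {
        // Waiting on `attempted` after a failed submission would spin forever;
        // waiting on `last_successful` always terminates.
        let target = self.last_successful.load(Ordering::Acquire);
        while self.signalled.load(Ordering::Acquire) < target {
            std::hint::spin_loop();
        }
    }
}

fn main() {
    let gpu = FakeGpu {
        attempted: AtomicU64::new(0),
        last_successful: AtomicU64::new(0),
        signalled: AtomicU64::new(0),
    };
    gpu.submit(true); // index 1, signalled
    gpu.submit(false); // index 2, never signalled
    gpu.wait_all(); // waits for 1, not 2, so it returns
    println!("wait_all returned without hanging");
}
```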


@@ -125,3 +125,37 @@ static WAIT_OUT_OF_ORDER: GpuTestConfiguration =
         .await
         .panic_on_timeout();
     });
+
+/// Submit a command buffer to the wrong device. A wait poll shouldn't hang.
+///
+/// We can't catch panics on Wasm, since they get reported directly to the
+/// console.
+#[gpu_test]
+static WAIT_AFTER_BAD_SUBMISSION: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(wgpu_test::TestParameters::default().skip(wgpu_test::FailureCase::webgl2()))
+    .run_async(wait_after_bad_submission);
+
+async fn wait_after_bad_submission(ctx: TestingContext) {
+    let (device2, queue2) =
+        wgpu_test::initialize_device(&ctx.adapter, ctx.device_features, ctx.device_limits.clone())
+            .await;
+
+    let command_buffer1 = ctx
+        .device
+        .create_command_encoder(&CommandEncoderDescriptor::default())
+        .finish();
+
+    // This should panic, since the command buffer belongs to the wrong
+    // device, and queue submission errors are reported as panics.
+    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        queue2.submit([command_buffer1]);
+    }));
+    assert!(result.is_err());
+
+    // This should not hang.
+    //
+    // Specifically, the failed submission should not allocate a new fence value
+    // that remains unsignalled until some later submission succeeds and raises
+    // the fence past it.
+    device2.poll(wgpu::Maintain::Wait);
+}


@@ -1069,7 +1069,7 @@ impl Global {
         let fence = fence_guard.as_mut().unwrap();
         let submit_index = device
             .active_submission_index
-            .fetch_add(1, Ordering::Relaxed)
+            .fetch_add(1, Ordering::SeqCst)
             + 1;
         let mut active_executions = Vec::new();
@@ -1392,6 +1392,11 @@
                 )
                 .map_err(DeviceError::from)?;
             }
+
+            // Advance the successful submission index.
+            device
+                .last_successful_submission_index
+                .fetch_max(submit_index, Ordering::SeqCst);
         }
 
         profiling::scope!("cleanup");
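
One detail worth noting in the hunk above: the successful index is published with `fetch_max` rather than a plain store, so the counter can only move forward even if two updates land out of order. A tiny std-only sketch of that property (the racing scenario is illustrative; it is not a claim about how wgpu-core's queue locking actually interleaves):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

fn main() {
    let last_successful = AtomicU64::new(0);

    // Imagine submission #2 publishes its index before submission #1 does;
    // applying the updates in that order must still leave the counter at 2.
    last_successful.fetch_max(2, Ordering::SeqCst);
    last_successful.fetch_max(1, Ordering::SeqCst); // stale update, has no effect

    assert_eq!(last_successful.load(Ordering::SeqCst), 2);
}
```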


@@ -88,7 +88,27 @@ pub struct Device<A: HalApi> {
     label: String,
 
     pub(crate) command_allocator: command::CommandAllocator<A>,
+
+    /// The index of the last command submission that was attempted.
+    ///
+    /// Note that `fence` may never be signalled with this value, if the command
+    /// submission failed. If you need to wait for everything running on a
+    /// `Queue` to complete, wait for [`last_successful_submission_index`].
+    ///
+    /// [`last_successful_submission_index`]: Device::last_successful_submission_index
     pub(crate) active_submission_index: hal::AtomicFenceValue,
+
+    /// The index of the last successful submission to this device's
+    /// [`hal::Queue`].
+    ///
+    /// Unlike [`active_submission_index`], which is incremented each time
+    /// submission is attempted, this is updated only when submission succeeds,
+    /// so waiting for this value won't hang waiting for work that was never
+    /// submitted.
+    ///
+    /// [`active_submission_index`]: Device::active_submission_index
+    pub(crate) last_successful_submission_index: hal::AtomicFenceValue,
+
     // NOTE: if both are needed, the `snatchable_lock` must be consistently acquired before the
     // `fence` lock to avoid deadlocks.
     pub(crate) fence: RwLock<Option<A::Fence>>,
@@ -257,6 +277,7 @@ impl<A: HalApi> Device<A> {
             label: desc.label.to_string(),
             command_allocator,
             active_submission_index: AtomicU64::new(0),
+            last_successful_submission_index: AtomicU64::new(0),
             fence: RwLock::new(rank::DEVICE_FENCE, Some(fence)),
             snatchable_lock: unsafe { SnatchLock::new(rank::DEVICE_SNATCHABLE_LOCK) },
             valid: AtomicBool::new(true),
@@ -387,37 +408,41 @@ impl<A: HalApi> Device<A> {
         profiling::scope!("Device::maintain");
         let fence = fence_guard.as_ref().unwrap();
-        let last_done_index = if maintain.is_wait() {
-            let index_to_wait_for = match maintain {
-                wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
-                    // We don't need to check to see if the queue id matches
-                    // as we already checked this from inside the poll call.
-                    submission_index.index
-                }
-                _ => self.active_submission_index.load(Ordering::Relaxed),
-            };
-            unsafe {
-                self.raw
-                    .as_ref()
-                    .unwrap()
-                    .wait(fence, index_to_wait_for, CLEANUP_WAIT_MS)
-                    .map_err(DeviceError::from)?
-            };
-            index_to_wait_for
-        } else {
-            unsafe {
+
+        // Determine which submission index `maintain` represents.
+        let submission_index = match maintain {
+            wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
+                // We don't need to check to see if the queue id matches
+                // as we already checked this from inside the poll call.
+                submission_index.index
+            }
+            wgt::Maintain::Wait => self
+                .last_successful_submission_index
+                .load(Ordering::Acquire),
+            wgt::Maintain::Poll => unsafe {
                 self.raw
                     .as_ref()
                     .unwrap()
                     .get_fence_value(fence)
                     .map_err(DeviceError::from)?
-            }
+            },
         };
 
-        log::info!("Device::maintain: last done index {last_done_index}");
+        // If necessary, wait for that submission to complete.
+        if maintain.is_wait() {
+            unsafe {
+                self.raw
+                    .as_ref()
+                    .unwrap()
+                    .wait(fence, submission_index, CLEANUP_WAIT_MS)
+                    .map_err(DeviceError::from)?
+            };
+        }
+
+        log::info!("Device::maintain: waiting for submission index {submission_index}");
 
         let mut life_tracker = self.lock_life();
         let submission_closures =
-            life_tracker.triage_submissions(last_done_index, &self.command_allocator);
+            life_tracker.triage_submissions(submission_index, &self.command_allocator);
 
         life_tracker.triage_mapped();
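
For context, here is a hedged sketch of how the three `Maintain` variants handled above are driven from wgpu's public API (assuming a wgpu of roughly this era plus the `pollster` crate; exact signatures and return types may differ between versions):

```rust
// Usage sketch only; adapter/device setup is abbreviated and error handling is minimal.
fn main() {
    pollster::block_on(async {
        let instance = wgpu::Instance::default();
        let adapter = instance
            .request_adapter(&wgpu::RequestAdapterOptions::default())
            .await
            .expect("no suitable adapter");
        let (device, queue) = adapter
            .request_device(&wgpu::DeviceDescriptor::default(), None)
            .await
            .expect("failed to create device");

        let encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
        let index = queue.submit([encoder.finish()]);

        // Wait for that specific submission (`WaitForSubmissionIndex`)...
        let _ = device.poll(wgpu::Maintain::WaitForSubmissionIndex(index));
        // ...or for everything successfully submitted so far (`Wait`), which this
        // commit keeps from hanging after a failed submission...
        let _ = device.poll(wgpu::Maintain::Wait);
        // ...or just check current fence progress without blocking (`Poll`).
        let _ = device.poll(wgpu::Maintain::Poll);
    });
}
```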
@@ -3586,7 +3611,9 @@ impl<A: HalApi> Device<A> {
     /// Wait for idle and remove resources that we can, before we die.
     pub(crate) fn prepare_to_die(&self) {
         self.pending_writes.lock().as_mut().unwrap().deactivate();
-        let current_index = self.active_submission_index.load(Ordering::Relaxed);
+        let current_index = self
+            .last_successful_submission_index
+            .load(Ordering::Acquire);
         if let Err(error) = unsafe {
             let fence = self.fence.read();
             let fence = fence.as_ref().unwrap();