From fc2fd95a988eaba59d561939971d7f516a4d5b7f Mon Sep 17 00:00:00 2001
From: Erich Gubler <erichdongubler@gmail.com>
Date: Tue, 24 Sep 2024 22:52:25 -0400
Subject: [PATCH] fix: handle `Queue::submit` non-fatally (#6318)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Change the signature of `wgpu_core::Global::queue_submit` to return
  a `(SubmissionIndex, …)` in addition to its current error type.
* Change the control flow of errors in `Queue::submit` to break to the
  end of a block. This is similar to what we already do in many APIs in
  `wgpu_core`.
* Hoist the scope of the local `submit_index` binding so it can be used
  at the point where we need to convert current error paths to also
  return the submission index.

Later, we will likely want to avoid actually retrieving a new submission
index so we can minimize the critical section of code. We'll need to
figure out a strategy for returning a valid (but not necessarily unique)
index in the case of failures that prevent successful submission.
---
 deno_webgpu/queue.rs                 |  2 +-
 tests/tests/regression/issue_6317.rs | 58 ++++++++++++++++++++++
 tests/tests/root.rs                  |  1 +
 wgpu-core/src/device/queue.rs        | 73 +++++++++++++++++++---------
 wgpu/src/backend/wgpu_core.rs        |  5 +-
 5 files changed, 115 insertions(+), 24 deletions(-)
 create mode 100644 tests/tests/regression/issue_6317.rs

diff --git a/deno_webgpu/queue.rs b/deno_webgpu/queue.rs
index fdbf993f8..5915b68f2 100644
--- a/deno_webgpu/queue.rs
+++ b/deno_webgpu/queue.rs
@@ -44,7 +44,7 @@ pub fn op_webgpu_queue_submit(
         })
         .collect::<Result<Vec<_>, AnyError>>()?;
 
-    let maybe_err = instance.queue_submit(queue, &ids).err();
+    let maybe_err = instance.queue_submit(queue, &ids).err().map(|(_idx, e)| e);
 
     for rid in command_buffers {
         let resource = state.resource_table.take::<WebGpuCommandBuffer>(rid)?;
diff --git a/tests/tests/regression/issue_6317.rs b/tests/tests/regression/issue_6317.rs
new file mode 100644
index 000000000..20945006f
--- /dev/null
+++ b/tests/tests/regression/issue_6317.rs
@@ -0,0 +1,58 @@
+use wgpu::{DownlevelFlags, Limits};
+use wgpu_macros::gpu_test;
+use wgpu_test::{fail, GpuTestConfiguration, TestParameters};
+
+#[gpu_test]
+static NON_FATAL_ERRORS_IN_QUEUE_SUBMIT: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS)
+            .limits(Limits::downlevel_defaults()),
+    )
+    .run_sync(|ctx| {
+        let shader_with_trivial_bind_group = concat!(
+            "@group(0) @binding(0) var<storage, read_write> stuff: u32;\n",
+            "\n",
+            "@compute @workgroup_size(1) fn main() { stuff = 2u; }\n"
+        );
+
+        let module = ctx
+            .device
+            .create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: None,
+                source: wgpu::ShaderSource::Wgsl(shader_with_trivial_bind_group.into()),
+            });
+
+        let compute_pipeline =
+            ctx.device
+                .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                    label: None,
+                    layout: None,
+                    module: &module,
+                    entry_point: None,
+                    compilation_options: Default::default(),
+                    cache: Default::default(),
+                });
+
+        fail(
+            &ctx.device,
+            || {
+                let mut command_encoder = ctx.device.create_command_encoder(&Default::default());
+                {
+                    let mut render_pass = command_encoder.begin_compute_pass(&Default::default());
+                    render_pass.set_pipeline(&compute_pipeline);
+
+                    // NOTE: We deliberately don't set a bind group here, to provoke a validation
+                    // error.
+
+                    render_pass.dispatch_workgroups(1, 1, 1);
+                }
+
+                let _idx = ctx.queue.submit([command_encoder.finish()]);
+            },
+            Some(concat!(
+                "The current set ComputePipeline with '' label ",
+                "expects a BindGroup to be set at index 0"
+            )),
+        )
+    });
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index df0dce5fe..3bb8e14a9 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -6,6 +6,7 @@ mod regression {
     mod issue_4485;
     mod issue_4514;
     mod issue_5553;
+    mod issue_6317;
 }
 
 mod bgra8unorm_storage;
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index f576b2412..bd6d99f1c 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -1027,11 +1027,13 @@ impl Global {
         &self,
         queue_id: QueueId,
         command_buffer_ids: &[id::CommandBufferId],
-    ) -> Result<SubmissionIndex, QueueSubmitError> {
+    ) -> Result<SubmissionIndex, (SubmissionIndex, QueueSubmitError)> {
         profiling::scope!("Queue::submit");
         api_log!("Queue::submit {queue_id:?}");
 
-        let (submit_index, callbacks) = {
+        let submit_index;
+
+        let res = 'error: {
             let hub = &self.hub;
 
             let queue = hub.queues.get(queue_id);
@@ -1042,7 +1044,7 @@ impl Global {
 
             // Fence lock must be acquired after the snatch lock everywhere to avoid deadlocks.
             let mut fence = device.fence.write();
-            let submit_index = device
+            submit_index = device
                 .active_submission_index
                 .fetch_add(1, Ordering::SeqCst)
                 + 1;
@@ -1119,18 +1121,29 @@ impl Global {
                         }
 
                         // execute resource transitions
-                        unsafe {
+                        if let Err(e) = unsafe {
                             baked.encoder.begin_encoding(hal_label(
                                 Some("(wgpu internal) Transit"),
                                 device.instance_flags,
                             ))
                         }
-                        .map_err(|e| device.handle_hal_error(e))?;
+                        .map_err(|e| device.handle_hal_error(e))
+                        {
+                            break 'error Err(e.into());
+                        }
 
                         //Note: locking the trackers has to be done after the storages
                         let mut trackers = device.trackers.lock();
-                        baked.initialize_buffer_memory(&mut trackers, &snatch_guard)?;
-                        baked.initialize_texture_memory(&mut trackers, device, &snatch_guard)?;
+                        if let Err(e) = baked.initialize_buffer_memory(&mut trackers, &snatch_guard)
+                        {
+                            break 'error Err(e.into());
+                        }
+                        if let Err(e) =
+                            baked.initialize_texture_memory(&mut trackers, device, &snatch_guard)
+                        {
+                            break 'error Err(e.into());
+                        }
+
                         //Note: stateless trackers are not merged:
                         // device already knows these resources exist.
                         CommandBuffer::insert_barriers_from_device_tracker(
@@ -1147,13 +1160,16 @@ impl Global {
                         // Note: we could technically do it after all of the command buffers,
                         // but here we have a command encoder by hand, so it's easier to use it.
                         if !used_surface_textures.is_empty() {
-                            unsafe {
+                            if let Err(e) = unsafe {
                                 baked.encoder.begin_encoding(hal_label(
                                     Some("(wgpu internal) Present"),
                                     device.instance_flags,
                                 ))
                             }
-                            .map_err(|e| device.handle_hal_error(e))?;
+                            .map_err(|e| device.handle_hal_error(e))
+                            {
+                                break 'error Err(e.into());
+                            }
                             let texture_barriers = trackers
                                 .textures
                                 .set_from_usage_scope_and_drain_transitions(
@@ -1180,7 +1196,7 @@ impl Global {
                     }
 
                     if let Some(first_error) = first_error {
-                        return Err(first_error);
+                        break 'error Err(first_error);
                     }
                 }
             }
@@ -1190,9 +1206,9 @@ impl Global {
             {
                 used_surface_textures.set_size(hub.textures.read().len());
                 for texture in pending_writes.dst_textures.values() {
-                    match texture.try_inner(&snatch_guard)? {
-                        TextureInner::Native { .. } => {}
-                        TextureInner::Surface { .. } => {
+                    match texture.try_inner(&snatch_guard) {
+                        Ok(TextureInner::Native { .. }) => {}
+                        Ok(TextureInner::Surface { .. }) => {
                             // Compare the Arcs by pointer as Textures don't implement Eq
                             submit_surface_textures_owned
                                 .insert(Arc::as_ptr(texture), texture.clone());
@@ -1203,6 +1219,7 @@ impl Global {
                                     .unwrap()
                             };
                         }
+                        Err(e) => break 'error Err(e.into()),
                     }
                 }
 
@@ -1224,10 +1241,12 @@ impl Global {
                 }
             }
 
-            if let Some(pending_execution) =
-                pending_writes.pre_submit(&device.command_allocator, device, &queue)?
-            {
-                active_executions.insert(0, pending_execution);
+            match pending_writes.pre_submit(&device.command_allocator, device, &queue) {
+                Ok(Some(pending_execution)) => {
+                    active_executions.insert(0, pending_execution);
+                }
+                Ok(None) => {}
+                Err(e) => break 'error Err(e.into()),
             }
 
             let hal_command_buffers = active_executions
@@ -1249,14 +1268,17 @@ impl Global {
                     submit_surface_textures.push(raw);
                 }
 
-                unsafe {
+                if let Err(e) = unsafe {
                     queue.raw().submit(
                         &hal_command_buffers,
                         &submit_surface_textures,
                         (fence.as_mut(), submit_index),
                     )
                 }
-                .map_err(|e| device.handle_hal_error(e))?;
+                .map_err(|e| device.handle_hal_error(e))
+                {
+                    break 'error Err(e.into());
+                }
 
                 // Advance the successful submission index.
                 device
@@ -1280,12 +1302,19 @@ impl Global {
             let (closures, _) =
                 match device.maintain(fence_guard, wgt::Maintain::Poll, snatch_guard) {
                     Ok(closures) => closures,
-                    Err(WaitIdleError::Device(err)) => return Err(QueueSubmitError::Queue(err)),
-                    Err(WaitIdleError::StuckGpu) => return Err(QueueSubmitError::StuckGpu),
+                    Err(WaitIdleError::Device(err)) => {
+                        break 'error Err(QueueSubmitError::Queue(err))
+                    }
+                    Err(WaitIdleError::StuckGpu) => break 'error Err(QueueSubmitError::StuckGpu),
                     Err(WaitIdleError::WrongSubmissionIndex(..)) => unreachable!(),
                 };
 
-            (submit_index, closures)
+            Ok(closures)
+        };
+
+        let callbacks = match res {
+            Ok(ok) => ok,
+            Err(e) => return Err((submit_index, e)),
         };
 
         // the closures should execute with nothing locked!
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index 3aac20e21..1d1ffda20 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -2074,7 +2074,10 @@ impl crate::Context for ContextWgpuCore {
 
         let index = match self.0.queue_submit(queue_data.id, &temp_command_buffers) {
             Ok(index) => index,
-            Err(err) => self.handle_error_fatal(err, "Queue::submit"),
+            Err((index, err)) => {
+                self.handle_error_nolabel(&queue_data.error_sink, err, "Queue::submit");
+                index
+            }
         };
 
         for cmdbuf in &temp_command_buffers {