add a retry mechanism for waiting on the last submission in Queue::drop

This commit is contained in:
teoxoy 2024-11-07 17:24:40 +01:00 committed by Teodor Tanasoaia
parent d489e4c2e8
commit 5a3de2d3a8

View File

@ -149,37 +149,75 @@ impl Drop for Queue {
.load(Ordering::Acquire); .load(Ordering::Acquire);
let fence = self.device.fence.read(); let fence = self.device.fence.read();
let wait_res = unsafe {
self.device.raw().wait(
fence.as_ref(),
last_successful_submission_index,
#[cfg(not(target_arch = "wasm32"))]
crate::device::CLEANUP_WAIT_MS,
#[cfg(target_arch = "wasm32")]
0, // WebKit and Chromium don't support a non-0 timeout
)
};
drop(fence);
match wait_res { // Try waiting on the last submission using the following sequence of timeouts
Ok(true) => {} let timeouts_in_ms = [100, 200, 400, 800, 1600, 3200];
// Note: If we don't panic here we are in UB land (destroying resources while they are still in use by the GPU).
Ok(false) => { for (i, timeout_ms) in timeouts_in_ms.into_iter().enumerate() {
// It's fine that we timed out on WebGL; GL objects can be deleted early as they let is_last_iter = i == timeouts_in_ms.len() - 1;
// will be kept around by the driver if GPU work hasn't finished.
// Moreover, the way we emulate read mappings on WebGL allows us to execute map_buffer earlier than on other api_log!(
// backends since getBufferSubData is synchronous with respect to the other previously enqueued GL commands. "Waiting on last submission. try: {}/{}. timeout: {}ms",
// Relying on this behavior breaks the clean abstraction wgpu-hal tries to maintain and i + 1,
// we should find ways to improve this. See https://github.com/gfx-rs/wgpu/issues/6538. timeouts_in_ms.len(),
#[cfg(not(target_arch = "wasm32"))] timeout_ms
panic!("We timed out while waiting on the last successful submission to complete!"); );
}
Err(e) => { let wait_res = unsafe {
panic!( self.device.raw().wait(
"We ran into an error while waiting on the last successful submission to complete! - {e}" fence.as_ref(),
); last_successful_submission_index,
#[cfg(not(target_arch = "wasm32"))]
timeout_ms,
#[cfg(target_arch = "wasm32")]
0, // WebKit and Chromium don't support a non-0 timeout
)
};
// Note: If we don't panic below we are in UB land (destroying resources while they are still in use by the GPU).
match wait_res {
Ok(true) => break,
Ok(false) => {
// It's fine that we timed out on WebGL; GL objects can be deleted early as they
// will be kept around by the driver if GPU work hasn't finished.
// Moreover, the way we emulate read mappings on WebGL allows us to execute map_buffer earlier than on other
// backends since getBufferSubData is synchronous with respect to the other previously enqueued GL commands.
// Relying on this behavior breaks the clean abstraction wgpu-hal tries to maintain and
// we should find ways to improve this. See https://github.com/gfx-rs/wgpu/issues/6538.
#[cfg(target_arch = "wasm32")]
{
break;
}
#[cfg(not(target_arch = "wasm32"))]
{
if is_last_iter {
panic!(
"We timed out while waiting on the last successful submission to complete!"
);
}
}
}
Err(e) => match e {
hal::DeviceError::OutOfMemory => {
if is_last_iter {
panic!(
"We ran into an OOM error while waiting on the last successful submission to complete!"
);
}
}
hal::DeviceError::Lost => {
self.device.handle_hal_error(e); // will lose the device
break;
}
hal::DeviceError::ResourceCreationFailed => unreachable!(),
hal::DeviceError::Unexpected => {
panic!(
"We ran into an unexpected error while waiting on the last successful submission to complete!"
);
}
},
} }
} }
drop(fence);
let snatch_guard = self.device.snatchable_lock.read(); let snatch_guard = self.device.snatchable_lock.read();
let (submission_closures, mapping_closures, queue_empty) = let (submission_closures, mapping_closures, queue_empty) =