Add Benchmarks (#5694)

Connor Fitzgerald 2024-05-16 09:05:41 -04:00 committed by GitHub
parent 3a798859cd
commit eeb1a9d7b7
25 changed files with 1674 additions and 380 deletions


@ -3,7 +3,17 @@
[profile.default] [profile.default]
slow-timeout = { period = "45s", terminate-after = 2 } slow-timeout = { period = "45s", terminate-after = 2 }
# Use two threads for tests with "2_threads" in their name # Use two threads for tests with "2 threads" in their name
[[profile.default.overrides]] [[profile.default.overrides]]
filter = 'test(~2_threads)' filter = 'test(~2_threads) | test(~2 threads)'
threads-required = 2 threads-required = 2
# Use four threads for tests with "4 threads" in their name
[[profile.default.overrides]]
filter = 'test(~4_threads) | test(~4 threads)'
threads-required = 4
# Use eight threads for tests with "8 threads" in their name
[[profile.default.overrides]]
filter = 'test(~8_threads) | test(~8 threads)'
threads-required = 8
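As a purely hypothetical illustration (names invented here, not part of this change), a test opts into one of these overrides simply by carrying the matching substring in its name, since nextest's `test(~…)` filter is a substring match:
```rust
// Hypothetical test: the "2_threads" fragment in the name is what the
// `test(~2_threads)` filter above matches, so nextest reserves two threads.
#[test]
fn records_commands_on_2_threads() {
    // ... test body that actually spawns a second worker thread ...
}
```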


@ -1,6 +1,8 @@
[bans] [bans]
multiple-versions = "deny" multiple-versions = "deny"
skip-tree = [ skip-tree = [
# We never enable loom in any of our dependencies but it causes dupes
{ name = "loom", version = "0.7.2" },
{ name = "windows-sys", version = "0.45" }, { name = "windows-sys", version = "0.45" },
{ name = "winit", version = "0.27" }, { name = "winit", version = "0.27" },
{ name = "winit", version = "0.29" }, { name = "winit", version = "0.29" },


@ -226,7 +226,7 @@ jobs:
cargo clippy --target ${{ matrix.target }} --no-default-features cargo clippy --target ${{ matrix.target }} --no-default-features
# Check with all features. # Check with all features.
cargo clippy --target ${{ matrix.target }} --tests --all-features cargo clippy --target ${{ matrix.target }} --tests --benches --all-features
# build docs # build docs
cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps

Cargo.lock (generated, 203 changed lines)

@ -1511,6 +1511,20 @@ dependencies = [
"slab", "slab",
] ]
[[package]]
name = "generator"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186014d53bc231d0090ef8d6f03e0920c54d85a5ed22f4f2f74315ec56cf83fb"
dependencies = [
"cc",
"cfg-if",
"libc",
"log",
"rustversion",
"windows 0.54.0",
]
[[package]] [[package]]
name = "gethostname" name = "gethostname"
version = "0.4.3" version = "0.4.3"
@ -1672,7 +1686,7 @@ dependencies = [
"presser", "presser",
"thiserror", "thiserror",
"winapi", "winapi",
"windows", "windows 0.52.0",
] ]
[[package]] [[package]]
@ -2047,6 +2061,19 @@ version = "0.4.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
[[package]]
name = "loom"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca"
dependencies = [
"cfg-if",
"generator",
"scoped-tls",
"tracing",
"tracing-subscriber",
]
[[package]] [[package]]
name = "malloc_buf" name = "malloc_buf"
version = "0.0.6" version = "0.0.6"
@ -2056,6 +2083,15 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "matchers"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
"regex-automata 0.1.10",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.2" version = "2.7.2"
@ -2141,11 +2177,9 @@ version = "0.20.0"
dependencies = [ dependencies = [
"arbitrary", "arbitrary",
"arrayvec 0.7.4", "arrayvec 0.7.4",
"bincode",
"bit-set", "bit-set",
"bitflags 2.5.0", "bitflags 2.5.0",
"codespan-reporting", "codespan-reporting",
"criterion",
"diff", "diff",
"env_logger", "env_logger",
"hexf-parse", "hexf-parse",
@ -2326,6 +2360,16 @@ dependencies = [
"rand_xorshift", "rand_xorshift",
] ]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.5" version = "0.4.5"
@ -2513,6 +2557,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]] [[package]]
name = "owned_ttf_parser" name = "owned_ttf_parser"
version = "0.21.0" version = "0.21.0"
@ -2892,8 +2942,17 @@ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
"regex-automata", "regex-automata 0.4.6",
"regex-syntax", "regex-syntax 0.8.3",
]
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
dependencies = [
"regex-syntax 0.6.29",
] ]
[[package]] [[package]]
@ -2904,9 +2963,15 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
"regex-syntax", "regex-syntax 0.8.3",
] ]
[[package]]
name = "regex-syntax"
version = "0.6.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]] [[package]]
name = "regex-syntax" name = "regex-syntax"
version = "0.8.3" version = "0.8.3"
@ -3138,6 +3203,15 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"lazy_static",
]
[[package]] [[package]]
name = "shared_library" name = "shared_library"
version = "0.1.9" version = "0.1.9"
@ -3410,6 +3484,16 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "thread_local"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]] [[package]]
name = "threadpool" name = "threadpool"
version = "1.8.1" version = "1.8.1"
@ -3567,6 +3651,59 @@ name = "tracing-core"
version = "0.1.32" version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
dependencies = [
"log",
"once_cell",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
]
[[package]]
name = "tracy-client"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fb931a64ff88984f86d3e9bcd1ae8843aa7fe44dd0f8097527bc172351741d"
dependencies = [
"loom",
"once_cell",
"tracy-client-sys",
]
[[package]]
name = "tracy-client-sys"
version = "0.22.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d104d610dfa9dd154535102cc9c6164ae1fa37842bc2d9e83f9ac82b0ae0882"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "ttf-parser" name = "ttf-parser"
@ -3716,6 +3853,12 @@ dependencies = [
"which", "which",
] ]
[[package]]
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]] [[package]]
name = "vec_map" name = "vec_map"
version = "0.8.2" version = "0.8.2"
@ -4077,6 +4220,23 @@ dependencies = [
"wgpu-types", "wgpu-types",
] ]
[[package]]
name = "wgpu-benchmark"
version = "0.20.0"
dependencies = [
"bincode",
"bytemuck",
"criterion",
"naga",
"nanorand",
"once_cell",
"pollster",
"profiling",
"rayon",
"tracy-client",
"wgpu",
]
[[package]] [[package]]
name = "wgpu-core" name = "wgpu-core"
version = "0.20.0" version = "0.20.0"
@ -4304,7 +4464,17 @@ version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
dependencies = [ dependencies = [
"windows-core", "windows-core 0.52.0",
"windows-targets 0.52.5",
]
[[package]]
name = "windows"
version = "0.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9252e5725dbed82865af151df558e754e4a3c2c30818359eb17465f1346a1b49"
dependencies = [
"windows-core 0.54.0",
"windows-targets 0.52.5", "windows-targets 0.52.5",
] ]
@ -4317,6 +4487,25 @@ dependencies = [
"windows-targets 0.52.5", "windows-targets 0.52.5",
] ]
[[package]]
name = "windows-core"
version = "0.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12661b9c89351d684a50a8a643ce5f608e20243b9fb84687800163429f161d65"
dependencies = [
"windows-result",
"windows-targets 0.52.5",
]
[[package]]
name = "windows-result"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "749f0da9cc72d82e600d8d2e44cadd0b9eedb9038f71a1c58556ac1c5791813b"
dependencies = [
"windows-targets 0.52.5",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.36.1" version = "0.36.1"


@ -5,8 +5,9 @@ members = [
"deno_webgpu", "deno_webgpu",
# default members # default members
"benches",
"d3d12", "d3d12",
"examples/", "examples",
"naga-cli", "naga-cli",
"naga", "naga",
"naga/fuzz", "naga/fuzz",
@ -22,8 +23,9 @@ members = [
] ]
exclude = [] exclude = []
default-members = [ default-members = [
"benches",
"d3d12", "d3d12",
"examples/", "examples",
"naga-cli", "naga-cli",
"naga", "naga",
"naga/fuzz", "naga/fuzz",
@ -70,11 +72,13 @@ version = "0.20.0"
[workspace.dependencies] [workspace.dependencies]
anyhow = "1.0.23" anyhow = "1.0.23"
arrayvec = "0.7" arrayvec = "0.7"
bincode = "1"
bit-vec = "0.6" bit-vec = "0.6"
bitflags = "2" bitflags = "2"
bytemuck = { version = "1.14", features = ["derive"] } bytemuck = { version = "1.14", features = ["derive"] }
cfg_aliases = "0.1" cfg_aliases = "0.1"
cfg-if = "1" cfg-if = "1"
criterion = "0.5"
codespan-reporting = "0.11" codespan-reporting = "0.11"
ctor = "0.2" ctor = "0.2"
document-features = "0.2.8" document-features = "0.2.8"
@ -109,6 +113,7 @@ png = "0.17.11"
pollster = "0.3" pollster = "0.3"
profiling = { version = "1", default-features = false } profiling = { version = "1", default-features = false }
raw-window-handle = "0.6" raw-window-handle = "0.6"
rayon = "1"
renderdoc-sys = "1.1.0" renderdoc-sys = "1.1.0"
ron = "0.8" ron = "0.8"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
@ -116,6 +121,7 @@ serde = "1"
serde_json = "1.0.116" serde_json = "1.0.116"
smallvec = "1" smallvec = "1"
static_assertions = "1.1.0" static_assertions = "1.1.0"
tracy-client = "0.17"
thiserror = "1" thiserror = "1"
wgpu = { version = "0.20.0", path = "./wgpu" } wgpu = { version = "0.20.0", path = "./wgpu" }
wgpu-core = { version = "0.20.0", path = "./wgpu-core" } wgpu-core = { version = "0.20.0", path = "./wgpu-core" }
@ -187,6 +193,10 @@ termcolor = "1.4.1"
#js-sys = { path = "../wasm-bindgen/crates/js-sys" } #js-sys = { path = "../wasm-bindgen/crates/js-sys" }
#wasm-bindgen = { path = "../wasm-bindgen" } #wasm-bindgen = { path = "../wasm-bindgen" }
[profile.release]
lto = "thin"
debug = true
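# debug info is kept in optimized builds (useful when profiling the benchmarks)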
# Speed up image comparison even in debug builds # Speed up image comparison even in debug builds
[profile.dev.package."nv-flip-sys"] [profile.dev.package."nv-flip-sys"]
opt-level = 3 opt-level = 3

benches/Cargo.toml (new file, 46 lines)

@ -0,0 +1,46 @@
[package]
name = "wgpu-benchmark"
version.workspace = true
authors.workspace = true
edition.workspace = true
description = "wgpu benchmarking suite"
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
license.workspace = true
autobenches = false
publish = false
[[bench]]
name = "root"
harness = false
path = "benches/root.rs"
[features]
# Uncomment these features to enable tracy and superluminal profiling.
# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
# superluminal = ["profiling/profile-with-superluminal"]
[dependencies]
bincode.workspace = true
bytemuck.workspace = true
criterion.workspace = true
naga = { workspace = true, features = [
"deserialize",
"serialize",
"wgsl-in",
"spv-in",
"glsl-in",
"spv-out",
"msl-out",
"hlsl-out",
"glsl-out",
"wgsl-out",
] }
nanorand.workspace = true
once_cell.workspace = true
pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true

benches/README.md (new file, 95 lines)

@ -0,0 +1,95 @@
Collection of CPU benchmarks for `wgpu`.
These benchmarks are designed as a first line of defence against performance regressions and roughly approximate the CPU overhead real applications will see.
They all do very little GPU work and test the CPU performance of the API.
Criterion reports the end-to-end time of each benchmark, but you can also run the benchmarks under a profiler to get more detailed information about where time is being spent.
## Usage
```sh
# Run all benchmarks
cargo bench -p wgpu-benchmark
# Run only the benchmarks whose names contain "filter"
cargo bench -p wgpu-benchmark -- "filter"
```
## Benchmarks
#### `Renderpass`
This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single-threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
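For intuition only, here is a minimal sketch (not the actual benchmark code) of how Criterion's `iter_custom` is used so that only the CPU-side work of interest is timed; the real benchmark additionally measures submission separately:
```rust
use std::time::{Duration, Instant};
use criterion::Criterion;

// Sketch: record command buffers inside the timed region, submit outside it.
fn bench_recording(c: &mut Criterion) {
    c.bench_function("record draws (sketch)", |b| {
        b.iter_custom(|iters| {
            let mut total = Duration::ZERO;
            for _ in 0..iters {
                let start = Instant::now();
                // ... encode the render pass and its draw calls here ...
                total += start.elapsed();
                // queue.submit() / device.poll() would go here, untimed
            }
            total
        })
    });
}
```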
#### `Resource Creation`
This benchmark measures the performance of creating large resources. By default it creates buffers that are 256 MiB each, and it tests this over a range of thread counts.
#### `Shader Compilation`
This benchmark measures the performance of naga parsing, validating, and generating shaders.
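As a rough sketch of the pipeline being measured (assuming naga's `wgsl-in` and `wgsl-out` features, which the benchmark crate enables; illustrative only, not the benchmark code itself):
```rust
// Parse WGSL, validate it, then write it back out as WGSL.
fn roundtrip_wgsl(source: &str) -> String {
    let module = naga::front::wgsl::Frontend::new().parse(source).unwrap();
    let mut validator = naga::valid::Validator::new(
        naga::valid::ValidationFlags::all(),
        naga::valid::Capabilities::empty(),
    );
    let info = validator.validate(&module).unwrap();
    let mut output = String::new();
    let mut writer =
        naga::back::wgsl::Writer::new(&mut output, naga::back::wgsl::WriterFlags::empty());
    writer.write(&module, &info).unwrap();
    drop(writer);
    output
}
```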
## Comparing Against a Baseline
To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
For example, to compare trunk against a v0.20 baseline, you could run the following:
```sh
git checkout v0.20
# Run the baseline benchmarks
cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
git checkout trunk
# Run the current benchmarks
cargo bench -p wgpu-benchmark -- --baseline "v0.20"
```
You can use this workflow to compare any two versions of the code.
## Integration with Profilers
The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
these features are commented out in `benches/Cargo.toml` and need to be uncommented before they can be used.
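The benchmark code is instrumented with the `profiling` crate, so when one of these backends is enabled the profiler shows named zones rather than bare call stacks, e.g. (sketch):
```rust
fn record_pass() {
    // Appears as a named zone in the profiler when a `profiling` backend
    // feature (e.g. profile-with-tracy) is enabled.
    profiling::scope!("record_pass");
    // ... work ...
}
```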
#### Tracy
Tracy is available prebuilt for Windows on [GitHub](https://github.com/wolfpld/tracy/releases/latest/).
```sh
# Once this is running, you can connect to it with the Tracy Profiler
cargo bench -p wgpu-benchmark --features tracy
```
#### Superluminal
Superluminal is a paid product for Windows, available [here](https://superluminal.eu/).
```sh
# This command will build the benchmarks, and display the path to the executable
cargo bench -p wgpu-benchmark --features superluminal -- -h
# Have Superluminal run the following command (replacing with the path to the executable)
./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
```
#### `perf` and others
You can follow the same pattern as above to run the benchmarks with other profilers.
For example, the command-line tool `perf` can be used to profile the benchmarks.
```sh
# This command will build the benchmarks, and display the path to the executable
cargo bench -p wgpu-benchmark -- -h
# Run the benchmarks with perf
perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
```


@ -0,0 +1,26 @@
@group(0) @binding(0)
var tex: binding_array<texture_2d<f32>>;
struct VertexOutput {
@builtin(position) position: vec4f,
@location(0) @interpolate(flat) instance_index: u32,
}
@vertex
fn vs_main(@builtin(instance_index) instance_index: u32) -> VertexOutput {
return VertexOutput(
vec4f(0.0, 0.0, 0.0, 1.0),
instance_index
);
}
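// Each instance reads its own run of 7 consecutive textures from the binding
// array, mirroring the 7 individual bindings of the non-bindless shader.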
@fragment
fn fs_main(vs_in: VertexOutput) -> @location(0) vec4f {
return textureLoad(tex[7 * vs_in.instance_index + 0], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 1], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 2], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 3], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 4], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 5], vec2u(0), 0) +
textureLoad(tex[7 * vs_in.instance_index + 6], vec2u(0), 0);
}


@ -0,0 +1,573 @@
use std::{
num::NonZeroU32,
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use once_cell::sync::Lazy;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
const DRAW_COUNT: usize = 10_000;
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
const VERTEX_BUFFER_COUNT: usize = DRAW_COUNT * VERTEX_BUFFERS_PER_DRAW;
const TEXTURE_COUNT: usize = DRAW_COUNT * TEXTURES_PER_DRAW;
struct RenderpassState {
device_state: DeviceState,
pipeline: wgpu::RenderPipeline,
bind_groups: Vec<wgpu::BindGroup>,
vertex_buffers: Vec<wgpu::Buffer>,
index_buffers: Vec<wgpu::Buffer>,
render_target: wgpu::TextureView,
// Bindless resources
bindless_bind_group: Option<wgpu::BindGroup>,
bindless_pipeline: Option<wgpu::RenderPipeline>,
}
impl RenderpassState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();
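// Bindless needs binding_array support plus enough sampled-texture slots to
// bind every texture in a single bind group.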
let supports_bindless = device_state.device.features().contains(
wgpu::Features::TEXTURE_BINDING_ARRAY
| wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
) && device_state
.device
.limits()
.max_sampled_textures_per_shader_stage
>= TEXTURE_COUNT as _;
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW);
for i in 0..TEXTURES_PER_DRAW {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
});
}
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &bind_group_layout_entries,
});
let mut texture_views = Vec::with_capacity(TEXTURE_COUNT);
for i in 0..TEXTURE_COUNT {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);
let texture_view_refs: Vec<_> = texture_views.iter().collect();
let mut bind_groups = Vec::with_capacity(DRAW_COUNT);
for draw_idx in 0..DRAW_COUNT {
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
for tex_idx in 0..TEXTURES_PER_DRAW {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
resource: wgpu::BindingResource::TextureView(
&texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx],
),
});
}
bind_groups.push(
device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &entries,
}),
);
}
random.shuffle(&mut bind_groups);
let sm = device_state
.device
.create_shader_module(wgpu::include_wgsl!("renderpass.wgsl"));
let pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
let mut vertex_buffers = Vec::with_capacity(VERTEX_BUFFER_COUNT);
for _ in 0..VERTEX_BUFFER_COUNT {
vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 3 * 16,
usage: wgpu::BufferUsages::VERTEX,
mapped_at_creation: false,
}));
}
random.shuffle(&mut vertex_buffers);
let mut index_buffers = Vec::with_capacity(DRAW_COUNT);
for _ in 0..DRAW_COUNT {
index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 3 * 4,
usage: wgpu::BufferUsages::INDEX,
mapped_at_creation: false,
}));
}
random.shuffle(&mut index_buffers);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]);
}
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
for attributes in &vertex_buffer_attributes {
vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
array_stride: 16,
step_mode: wgpu::VertexStepMode::Vertex,
attributes,
});
}
let pipeline =
device_state
.device
.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
label: None,
layout: Some(&pipeline_layout),
vertex: wgpu::VertexState {
module: &sm,
entry_point: "vs_main",
buffers: &vertex_buffer_layouts,
compilation_options: wgpu::PipelineCompilationOptions::default(),
},
primitive: wgpu::PrimitiveState {
topology: wgpu::PrimitiveTopology::TriangleList,
strip_index_format: None,
front_face: wgpu::FrontFace::Cw,
cull_mode: Some(wgpu::Face::Back),
polygon_mode: wgpu::PolygonMode::Fill,
unclipped_depth: false,
conservative: false,
},
depth_stencil: None,
multisample: wgpu::MultisampleState::default(),
fragment: Some(wgpu::FragmentState {
module: &sm,
entry_point: "fs_main",
targets: &[Some(wgpu::ColorTargetState {
format: wgpu::TextureFormat::Rgba8UnormSrgb,
blend: None,
write_mask: wgpu::ColorWrites::ALL,
})],
compilation_options: wgpu::PipelineCompilationOptions::default(),
}),
multiview: None,
});
let render_target = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some("Render Target"),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
view_formats: &[],
})
.create_view(&wgpu::TextureViewDescriptor::default());
let mut bindless_bind_group = None;
let mut bindless_pipeline = None;
if supports_bindless {
let bindless_bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()),
}],
});
bindless_bind_group = Some(device_state.device.create_bind_group(
&wgpu::BindGroupDescriptor {
label: None,
layout: &bindless_bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs),
}],
},
));
let bindless_shader_module = device_state
.device
.create_shader_module(wgpu::include_wgsl!("renderpass-bindless.wgsl"));
let bindless_pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bindless_bind_group_layout],
push_constant_ranges: &[],
});
bindless_pipeline = Some(device_state.device.create_render_pipeline(
&wgpu::RenderPipelineDescriptor {
label: None,
layout: Some(&bindless_pipeline_layout),
vertex: wgpu::VertexState {
module: &bindless_shader_module,
entry_point: "vs_main",
buffers: &vertex_buffer_layouts,
compilation_options: wgpu::PipelineCompilationOptions::default(),
},
primitive: wgpu::PrimitiveState {
topology: wgpu::PrimitiveTopology::TriangleList,
strip_index_format: None,
front_face: wgpu::FrontFace::Cw,
cull_mode: Some(wgpu::Face::Back),
polygon_mode: wgpu::PolygonMode::Fill,
unclipped_depth: false,
conservative: false,
},
depth_stencil: None,
multisample: wgpu::MultisampleState::default(),
fragment: Some(wgpu::FragmentState {
module: &bindless_shader_module,
entry_point: "fs_main",
targets: &[Some(wgpu::ColorTargetState {
format: wgpu::TextureFormat::Rgba8UnormSrgb,
blend: None,
write_mask: wgpu::ColorWrites::ALL,
})],
compilation_options: wgpu::PipelineCompilationOptions::default(),
}),
multiview: None,
},
));
}
Self {
device_state,
pipeline,
bind_groups,
vertex_buffers,
index_buffers,
render_target,
bindless_bind_group,
bindless_pipeline,
}
}
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));
let draws_per_pass = DRAW_COUNT / total_passes;
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: None,
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: &self.render_target,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
store: wgpu::StoreOp::Store,
},
})],
occlusion_query_set: None,
timestamp_writes: None,
depth_stencil_attachment: None,
});
let start_idx = pass_number * draws_per_pass;
let end_idx = start_idx + draws_per_pass;
for draw_idx in start_idx..end_idx {
render_pass.set_pipeline(&self.pipeline);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(
i as u32,
self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..),
);
}
render_pass.set_index_buffer(
self.index_buffers[draw_idx].slice(..),
wgpu::IndexFormat::Uint32,
);
render_pass.draw_indexed(0..3, 0, 0..1);
}
drop(render_pass);
encoder.finish()
}
fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Renderpass");
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: None,
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: &self.render_target,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
store: wgpu::StoreOp::Store,
},
})],
occlusion_query_set: None,
timestamp_writes: None,
depth_stencil_attachment: None,
});
render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
render_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..));
}
render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);
for draw_idx in 0..DRAW_COUNT {
render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1);
}
drop(render_pass);
encoder.finish()
}
}
fn run_bench(ctx: &mut Criterion) {
let state = Lazy::new(RenderpassState::new);
// Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
group.throughput(Throughput::Elements(DRAW_COUNT as _));
for time_submit in [false, true] {
for rpasses in [1, 2, 4, 8] {
let draws_per_pass = DRAW_COUNT / rpasses;
let label = if time_submit {
"Submit Time"
} else {
"Renderpass Time"
};
group.bench_function(
&format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
|b| {
Lazy::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let mut start = Instant::now();
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses));
}
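// Depending on `time_submit`, either stop the clock here (recording only)
// or restart it so that only `queue.submit` below is measured.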
if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}
state.device_state.queue.submit(buffers);
if time_submit {
duration += start.elapsed();
}
state.device_state.device.poll(wgpu::Maintain::Wait);
}
duration
})
},
);
}
}
group.finish();
// Test 10k draw calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
group.throughput(Throughput::Elements(DRAW_COUNT as _));
for threads in [2, 4, 8] {
let draws_per_pass = DRAW_COUNT / threads;
group.bench_function(
&format!("{threads} threads x {draws_per_pass} draws"),
|b| {
Lazy::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads))
.collect::<Vec<_>>();
duration += start.elapsed();
state.device_state.queue.submit(buffers);
state.device_state.device.poll(wgpu::Maintain::Wait);
}
duration
})
},
);
}
group.finish();
// Test 10k draw calls in a single pass, using a bindless bind group for all resources.
let mut group = ctx.benchmark_group("Renderpass: Bindless");
group.throughput(Throughput::Elements(DRAW_COUNT as _));
group.bench_function(&format!("{DRAW_COUNT} draws"), |b| {
Lazy::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs_f32(1.0);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffer = state.run_bindless_pass();
duration += start.elapsed();
state.device_state.queue.submit([buffer]);
state.device_state.device.poll(wgpu::Maintain::Wait);
}
duration
})
});
group.finish();
ctx.bench_function(
&format!(
"Renderpass: Empty Submit with {} Resources",
TEXTURE_COUNT + VERTEX_BUFFER_COUNT
),
|b| {
Lazy::force(&state);
b.iter(|| state.device_state.queue.submit([]));
},
);
}
criterion_group! {
name = renderpass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
}


@ -0,0 +1,36 @@
@group(0) @binding(0)
var tex_1: texture_2d<f32>;
@group(0) @binding(1)
var tex_2: texture_2d<f32>;
@group(0) @binding(2)
var tex_3: texture_2d<f32>;
@group(0) @binding(3)
var tex_4: texture_2d<f32>;
@group(0) @binding(4)
var tex_5: texture_2d<f32>;
@group(0) @binding(5)
var tex_6: texture_2d<f32>;
@group(0) @binding(6)
var tex_7: texture_2d<f32>;
@vertex
fn vs_main() -> @builtin(position) vec4f {
return vec4f(0.0, 0.0, 0.0, 1.0);
}
@fragment
fn fs_main() -> @location(0) vec4f {
return textureLoad(tex_1, vec2u(0), 0) +
textureLoad(tex_2, vec2u(0), 0) +
textureLoad(tex_3, vec2u(0), 0) +
textureLoad(tex_4, vec2u(0), 0) +
textureLoad(tex_5, vec2u(0), 0) +
textureLoad(tex_6, vec2u(0), 0) +
textureLoad(tex_7, vec2u(0), 0);
}


@ -0,0 +1,71 @@
use std::time::{Duration, Instant};
use criterion::{criterion_group, Criterion, Throughput};
use once_cell::sync::Lazy;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
fn run_bench(ctx: &mut Criterion) {
let state = Lazy::new(DeviceState::new);
const RESOURCES_TO_CREATE: usize = 8;
let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));
for threads in [1, 2, 4, 8] {
let resources_per_thread = RESOURCES_TO_CREATE / threads;
group.bench_function(
&format!("{threads} threads x {resources_per_thread} resource"),
|b| {
Lazy::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
// We can't create too many resources at once, so we do it 8 resources at a time.
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|_| {
(0..resources_per_thread)
.map(|_| {
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
duration += start.elapsed();
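// Only creation is timed; dropping the buffers and polling the device lets
// them be reclaimed before the next iteration.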
drop(buffers);
state.queue.submit([]);
state.device.poll(wgpu::Maintain::Wait);
}
duration
})
},
);
}
group.finish();
}
criterion_group! {
name = resource_creation;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
}

benches/benches/root.rs (new file, 65 lines)

@ -0,0 +1,65 @@
use criterion::criterion_main;
use pollster::block_on;
mod renderpass;
mod resource_creation;
mod shader;
struct DeviceState {
adapter_info: wgpu::AdapterInfo,
device: wgpu::Device,
queue: wgpu::Queue,
}
impl DeviceState {
fn new() -> Self {
#[cfg(feature = "tracy")]
tracy_client::Client::start();
let base_backend = if cfg!(target_os = "macos") {
// We don't want to use MoltenVK on Mac.
wgpu::Backends::METAL
} else {
wgpu::Backends::all()
};
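// Backend and adapter selection can still be overridden via the standard
// `WGPU_*` environment variables.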
let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
backends: wgpu::util::backend_bits_from_env().unwrap_or(base_backend),
flags: wgpu::InstanceFlags::empty(),
dx12_shader_compiler: wgpu::util::dx12_shader_compiler_from_env()
.unwrap_or(wgpu::Dx12Compiler::Fxc),
gles_minor_version: wgpu::Gles3MinorVersion::Automatic,
});
let adapter = block_on(wgpu::util::initialize_adapter_from_env_or_default(
&instance, None,
))
.unwrap();
let adapter_info = adapter.get_info();
eprintln!("{:?}", adapter_info);
let (device, queue) = block_on(adapter.request_device(
&wgpu::DeviceDescriptor {
required_features: adapter.features(),
required_limits: adapter.limits(),
label: Some("RenderPass Device"),
},
None,
))
.unwrap();
Self {
adapter_info,
device,
queue,
}
}
}
criterion_main!(
renderpass::renderpass,
resource_creation::resource_creation,
shader::shader
);

benches/benches/shader.rs (new file, 355 lines)

@ -0,0 +1,355 @@
use criterion::*;
use std::{fs, path::PathBuf};
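// Each stage (raw bytes -> UTF-8 string -> parsed module -> validation info)
// is cached on the input so later benchmarks reuse earlier work.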
struct Input {
filename: String,
size: u64,
data: Vec<u8>,
string: Option<String>,
module: Option<naga::Module>,
module_info: Option<naga::valid::ModuleInfo>,
}
struct Inputs {
inner: Vec<Input>,
}
impl Inputs {
fn from_dir(folder: &str, extension: &str) -> Self {
let mut inputs = Vec::new();
let read_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join(folder)
.read_dir()
.unwrap();
for file_entry in read_dir {
match file_entry {
Ok(entry) => match entry.path().extension() {
Some(ostr) if ostr == extension => {
let path = entry.path();
inputs.push(Input {
filename: path.to_string_lossy().into_owned(),
size: entry.metadata().unwrap().len(),
string: None,
data: vec![],
module: None,
module_info: None,
});
}
_ => continue,
},
Err(e) => {
eprintln!("Skipping file: {:?}", e);
continue;
}
}
}
Self { inner: inputs }
}
fn bytes(&self) -> u64 {
self.inner.iter().map(|input| input.size).sum()
}
fn load(&mut self) {
for input in &mut self.inner {
if !input.data.is_empty() {
continue;
}
input.data = fs::read(&input.filename).unwrap_or_default();
}
}
fn load_utf8(&mut self) {
self.load();
for input in &mut self.inner {
if input.string.is_some() {
continue;
}
input.string = Some(std::str::from_utf8(&input.data).unwrap().to_string());
}
}
fn parse(&mut self) {
self.load_utf8();
let mut parser = naga::front::wgsl::Frontend::new();
for input in &mut self.inner {
if input.module.is_some() {
continue;
}
input.module = Some(parser.parse(input.string.as_ref().unwrap()).unwrap());
}
}
fn validate(&mut self) {
self.parse();
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
// Note: capabilities are left empty so the validated modules are accepted by all backends.
naga::valid::Capabilities::empty(),
);
for input in &mut self.inner {
if input.module_info.is_some() {
continue;
}
input.module_info = validator.validate(input.module.as_ref().unwrap()).ok();
}
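// Drop any inputs that failed validation so the backend benchmarks only see
// valid modules.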
self.inner.retain(|input| input.module_info.is_some());
}
}
fn parse_glsl(stage: naga::ShaderStage, inputs: &Inputs) {
let mut parser = naga::front::glsl::Frontend::default();
let options = naga::front::glsl::Options {
stage,
defines: Default::default(),
};
for input in &inputs.inner {
parser
.parse(&options, input.string.as_deref().unwrap())
.unwrap();
}
}
fn frontends(c: &mut Criterion) {
let mut group = c.benchmark_group("front");
let mut inputs_wgsl = Inputs::from_dir("../naga/tests/in", "wgsl");
group.throughput(Throughput::Bytes(inputs_wgsl.bytes()));
group.bench_function("shader: naga module bincode decode", |b| {
inputs_wgsl.parse();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| bincode::serialize(&input.module.as_ref().unwrap()).unwrap())
.collect::<Vec<_>>();
b.iter(move || {
for input in inputs_bin.iter() {
bincode::deserialize::<naga::Module>(input).unwrap();
}
});
});
group.bench_function("shader: wgsl-in", |b| {
inputs_wgsl.load_utf8();
let mut frontend = naga::front::wgsl::Frontend::new();
b.iter(|| {
for input in &inputs_wgsl.inner {
frontend.parse(input.string.as_ref().unwrap()).unwrap();
}
});
});
let mut inputs_spirv = Inputs::from_dir("../naga/tests/in/spv", "spv");
group.throughput(Throughput::Bytes(inputs_spirv.bytes()));
group.bench_function("shader: spv-in", |b| {
inputs_spirv.load();
b.iter(|| {
let options = naga::front::spv::Options::default();
for input in &inputs_spirv.inner {
let spv = bytemuck::cast_slice(&input.data);
let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options);
parser.parse().unwrap();
}
});
});
let mut inputs_vertex = Inputs::from_dir("../naga/tests/in/glsl", "vert");
let mut inputs_fragment = Inputs::from_dir("../naga/tests/in/glsl", "frag");
// let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp");
group.throughput(Throughput::Bytes(
inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes()
));
group.bench_function("shader: glsl-in", |b| {
inputs_vertex.load();
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
// inputs_compute.load_utf8();
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex));
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_fragment));
// TODO: This one hangs for some reason
// b.iter(move || parse_glsl(naga::ShaderStage::Compute, &inputs_compute));
});
}
fn validation(c: &mut Criterion) {
let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl");
let mut group = c.benchmark_group("validate");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: validation", |b| {
inputs.load();
inputs.load_utf8();
inputs.parse();
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
b.iter(|| {
for input in &inputs.inner {
validator.validate(input.module.as_ref().unwrap()).unwrap();
}
});
});
group.finish();
}
fn backends(c: &mut Criterion) {
let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl");
let mut group = c.benchmark_group("back");
// While normally this would be done inside the bench_function callback, we need to
// run this to properly know the size of the inputs, as any that fail validation
// will be removed.
inputs.validate();
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: wgsl-out", |b| {
b.iter(|| {
let mut string = String::new();
let flags = naga::back::wgsl::WriterFlags::empty();
for input in &inputs.inner {
let mut writer = naga::back::wgsl::Writer::new(&mut string, flags);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
);
string.clear();
}
});
});
group.bench_function("shader: spv-out", |b| {
b.iter(|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for input in &inputs.inner {
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
data.clear();
}
});
});
group.bench_function("shader: spv-out multiple entrypoints", |b| {
b.iter(|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for input in &inputs.inner {
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
let module = input.module.as_ref().unwrap();
for ep in module.entry_points.iter() {
let pipeline_options = naga::back::spv::PipelineOptions {
shader_stage: ep.stage,
entry_point: ep.name.clone(),
};
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
Some(&pipeline_options),
&None,
&mut data,
);
data.clear();
}
}
});
});
group.bench_function("shader: msl-out", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
});
});
group.bench_function("shader: hlsl-out", |b| {
b.iter(|| {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
let mut writer = naga::back::hlsl::Writer::new(&mut string, &options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
); // may fail on unimplemented things
string.clear();
}
});
});
group.bench_function("shader: glsl-out multiple entrypoints", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::glsl::Options {
version: naga::back::glsl::Version::new_gles(320),
writer_flags: naga::back::glsl::WriterFlags::empty(),
binding_map: Default::default(),
zero_initialize_workgroup_memory: true,
};
for input in &inputs.inner {
let module = input.module.as_ref().unwrap();
let info = input.module_info.as_ref().unwrap();
for ep in module.entry_points.iter() {
let pipeline_options = naga::back::glsl::PipelineOptions {
shader_stage: ep.stage,
entry_point: ep.name.clone(),
multiview: None,
};
// might be `Err` if missing features
if let Ok(mut writer) = naga::back::glsl::Writer::new(
&mut string,
module,
info,
&options,
&pipeline_options,
naga::proc::BoundsCheckPolicies::default(),
) {
let _ = writer.write(); // might be `Err` if unsupported
}
string.clear();
}
}
});
});
}
criterion_group!(shader, frontends, validation, backends);


@ -35,10 +35,6 @@ wgsl-out = []
hlsl-out = [] hlsl-out = []
compact = [] compact = []
[[bench]]
name = "criterion"
harness = false
[dependencies] [dependencies]
arbitrary = { version = "1.3", features = ["derive"], optional = true } arbitrary = { version = "1.3", features = ["derive"], optional = true }
bitflags = "2.5" bitflags = "2.5"
@ -60,11 +56,7 @@ hexf-parse = { version = "0.2.1", optional = true }
unicode-xid = { version = "0.2.3", optional = true } unicode-xid = { version = "0.2.3", optional = true }
arrayvec.workspace = true arrayvec.workspace = true
[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
criterion = { version = "0.5", features = [] }
[dev-dependencies] [dev-dependencies]
bincode = "1"
diff = "0.1" diff = "0.1"
env_logger = "0.11" env_logger = "0.11"
# This _cannot_ have a version specified. If it does, crates.io will look # This _cannot_ have a version specified. If it does, crates.io will look


@ -1,273 +0,0 @@
#![cfg(not(target_arch = "wasm32"))]
#![allow(clippy::needless_borrowed_reference)]
use criterion::*;
use std::{fs, path::PathBuf, slice};
fn gather_inputs(folder: &str, extension: &str) -> Vec<Box<[u8]>> {
let mut list = Vec::new();
let read_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join(folder)
.read_dir()
.unwrap();
for file_entry in read_dir {
match file_entry {
Ok(entry) => match entry.path().extension() {
Some(ostr) if ostr == extension => {
let input = fs::read(entry.path()).unwrap_or_default();
list.push(input.into_boxed_slice());
}
_ => continue,
},
Err(e) => {
log::warn!("Skipping file: {:?}", e);
continue;
}
}
}
list
}
fn parse_glsl(stage: naga::ShaderStage, inputs: &[Box<[u8]>]) {
let mut parser = naga::front::glsl::Frontend::default();
let options = naga::front::glsl::Options {
stage,
defines: Default::default(),
};
for input in inputs.iter() {
let string = std::str::from_utf8(input).unwrap();
parser.parse(&options, string).unwrap();
}
}
fn frontends(c: &mut Criterion) {
let mut group = c.benchmark_group("front");
#[cfg(all(feature = "wgsl-in", feature = "serialize", feature = "deserialize"))]
group.bench_function("bin", |b| {
let inputs_wgsl = gather_inputs("tests/in", "wgsl");
let mut frontend = naga::front::wgsl::Frontend::new();
let inputs_bin = inputs_wgsl
.iter()
.map(|input| {
let string = std::str::from_utf8(input).unwrap();
let module = frontend.parse(string).unwrap();
bincode::serialize(&module).unwrap()
})
.collect::<Vec<_>>();
b.iter(move || {
for input in inputs_bin.iter() {
bincode::deserialize::<naga::Module>(input).unwrap();
}
});
});
#[cfg(feature = "wgsl-in")]
group.bench_function("wgsl", |b| {
let inputs_wgsl = gather_inputs("tests/in", "wgsl");
let inputs = inputs_wgsl
.iter()
.map(|input| std::str::from_utf8(input).unwrap())
.collect::<Vec<_>>();
let mut frontend = naga::front::wgsl::Frontend::new();
b.iter(move || {
for &input in inputs.iter() {
frontend.parse(input).unwrap();
}
});
});
#[cfg(feature = "spv-in")]
group.bench_function("spv", |b| {
let inputs = gather_inputs("tests/in/spv", "spv");
b.iter(move || {
let options = naga::front::spv::Options::default();
for input in inputs.iter() {
let spv =
unsafe { slice::from_raw_parts(input.as_ptr() as *const u32, input.len() / 4) };
let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options);
parser.parse().unwrap();
}
});
});
#[cfg(feature = "glsl-in")]
group.bench_function("glsl", |b| {
let vert = gather_inputs("tests/in/glsl", "vert");
b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &vert));
let frag = gather_inputs("tests/in/glsl", "frag");
b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &frag));
//TODO: hangs for some reason!
//let comp = gather_inputs("tests/in/glsl", "comp");
//b.iter(move || parse_glsl(naga::ShaderStage::Compute, &comp));
});
}
#[cfg(feature = "wgsl-in")]
fn gather_modules() -> Vec<naga::Module> {
let inputs = gather_inputs("tests/in", "wgsl");
let mut frontend = naga::front::wgsl::Frontend::new();
inputs
.iter()
.map(|input| {
let string = std::str::from_utf8(input).unwrap();
frontend.parse(string).unwrap()
})
.collect()
}
#[cfg(not(feature = "wgsl-in"))]
fn gather_modules() -> Vec<naga::Module> {
Vec::new()
}
fn validation(c: &mut Criterion) {
let inputs = gather_modules();
let mut group = c.benchmark_group("valid");
group.bench_function("safe", |b| {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
b.iter(|| {
for input in inputs.iter() {
validator.validate(input).unwrap();
}
});
});
group.bench_function("unsafe", |b| {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::empty(),
naga::valid::Capabilities::all(),
);
b.iter(|| {
for input in inputs.iter() {
validator.validate(input).unwrap();
}
});
});
}
fn backends(c: &mut Criterion) {
let inputs = {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::empty(),
naga::valid::Capabilities::default(),
);
let input_modules = gather_modules();
input_modules
.into_iter()
.flat_map(|module| validator.validate(&module).ok().map(|info| (module, info)))
.collect::<Vec<_>>()
};
let mut group = c.benchmark_group("back");
#[cfg(feature = "wgsl-out")]
group.bench_function("wgsl", |b| {
b.iter(|| {
let mut string = String::new();
let flags = naga::back::wgsl::WriterFlags::empty();
for &(ref module, ref info) in inputs.iter() {
let mut writer = naga::back::wgsl::Writer::new(&mut string, flags);
writer.write(module, info).unwrap();
string.clear();
}
});
});
#[cfg(feature = "spv-out")]
group.bench_function("spv", |b| {
b.iter(|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for &(ref module, ref info) in inputs.iter() {
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
writer.write(module, info, None, &None, &mut data).unwrap();
data.clear();
}
});
});
#[cfg(feature = "spv-out")]
group.bench_function("spv-separate", |b| {
b.iter(|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for &(ref module, ref info) in inputs.iter() {
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
for ep in module.entry_points.iter() {
let pipeline_options = naga::back::spv::PipelineOptions {
shader_stage: ep.stage,
entry_point: ep.name.clone(),
};
writer
.write(module, info, Some(&pipeline_options), &None, &mut data)
.unwrap();
data.clear();
}
}
});
});
#[cfg(feature = "msl-out")]
group.bench_function("msl", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for &(ref module, ref info) in inputs.iter() {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
writer
.write(module, info, &options, &pipeline_options)
.unwrap();
string.clear();
}
});
});
#[cfg(feature = "hlsl-out")]
group.bench_function("hlsl", |b| {
b.iter(|| {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for &(ref module, ref info) in inputs.iter() {
let mut writer = naga::back::hlsl::Writer::new(&mut string, &options);
let _ = writer.write(module, info); // may fail on unimplemented things
string.clear();
}
});
});
#[cfg(feature = "glsl-out")]
group.bench_function("glsl-separate", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::glsl::Options {
version: naga::back::glsl::Version::new_gles(320),
writer_flags: naga::back::glsl::WriterFlags::empty(),
binding_map: Default::default(),
zero_initialize_workgroup_memory: true,
};
for &(ref module, ref info) in inputs.iter() {
for ep in module.entry_points.iter() {
let pipeline_options = naga::back::glsl::PipelineOptions {
shader_stage: ep.stage,
entry_point: ep.name.clone(),
multiview: None,
};
// might be `Err` if missing features
if let Ok(mut writer) = naga::back::glsl::Writer::new(
&mut string,
module,
info,
&options,
&pipeline_options,
naga::proc::BoundsCheckPolicies::default(),
) {
let _ = writer.write(); // might be `Err` if unsupported
}
string.clear();
}
}
});
});
}
criterion_group!(criterion, frontends, validation, backends,);
criterion_main!(criterion);


@ -21,23 +21,27 @@ features = ["arbitrary", "spv-in", "wgsl-in", "glsl-in"]
[[bin]] [[bin]]
name = "spv_parser" name = "spv_parser"
path = "fuzz_targets/spv_parser.rs" path = "fuzz_targets/spv_parser.rs"
bench = false
test = false test = false
doc = false doc = false
[[bin]] [[bin]]
name = "wgsl_parser" name = "wgsl_parser"
path = "fuzz_targets/wgsl_parser.rs" path = "fuzz_targets/wgsl_parser.rs"
bench = false
test = false test = false
doc = false doc = false
[[bin]] [[bin]]
name = "glsl_parser" name = "glsl_parser"
path = "fuzz_targets/glsl_parser.rs" path = "fuzz_targets/glsl_parser.rs"
bench = false
test = false test = false
doc = false doc = false
[[bin]] [[bin]]
name = "ir" name = "ir"
path = "fuzz_targets/ir.rs" path = "fuzz_targets/ir.rs"
bench = false
test = false test = false
doc = false doc = false


@ -1044,7 +1044,12 @@ impl<'a, W: Write> super::Writer<'a, W> {
crate::Expression::GlobalVariable(var_handle) => { crate::Expression::GlobalVariable(var_handle) => {
&module.global_variables[var_handle] &module.global_variables[var_handle]
} }
ref other => unreachable!("Array length of base {:?}", other), ref other => {
return Err(super::Error::Unimplemented(format!(
"Array length of base {:?}",
other
)))
}
}; };
let storage_access = match global_var.space { let storage_access = match global_var.space {
crate::AddressSpace::Storage { access } => access, crate::AddressSpace::Storage { access } => access,


@ -172,6 +172,8 @@ impl<A: HalApi> BakedCommands<A> {
device_tracker: &mut Tracker<A>, device_tracker: &mut Tracker<A>,
snatch_guard: &SnatchGuard<'_>, snatch_guard: &SnatchGuard<'_>,
) -> Result<(), DestroyedBufferError> { ) -> Result<(), DestroyedBufferError> {
profiling::scope!("initialize_buffer_memory");
// Gather init ranges for each buffer so we can collapse them. // Gather init ranges for each buffer so we can collapse them.
// It is not possible to do this at an earlier point since previously // It is not possible to do this at an earlier point since previously
// executed command buffer change the resource init state. // executed command buffer change the resource init state.
@ -276,6 +278,8 @@ impl<A: HalApi> BakedCommands<A> {
device: &Device<A>, device: &Device<A>,
snatch_guard: &SnatchGuard<'_>, snatch_guard: &SnatchGuard<'_>,
) -> Result<(), DestroyedTextureError> { ) -> Result<(), DestroyedTextureError> {
profiling::scope!("initialize_texture_memory");
let mut ranges: Vec<TextureInitRange> = Vec::new(); let mut ranges: Vec<TextureInitRange> = Vec::new();
for texture_use in self.texture_memory_actions.drain_init_actions() { for texture_use in self.texture_memory_actions.drain_init_actions() {
let mut initialization_status = texture_use.texture.initialization_status.write(); let mut initialization_status = texture_use.texture.initialization_status.write();


@ -32,7 +32,9 @@ pub const SHADER_STAGE_COUNT: usize = hal::MAX_CONCURRENT_SHADER_STAGES;
// value is enough for a 16k texture with float4 format. // value is enough for a 16k texture with float4 format.
pub(crate) const ZERO_BUFFER_SIZE: BufferAddress = 512 << 10; pub(crate) const ZERO_BUFFER_SIZE: BufferAddress = 512 << 10;
const CLEANUP_WAIT_MS: u32 = 5000; // If a submission is not completed within this time, we go off into UB land.
// See https://github.com/gfx-rs/wgpu/issues/4589. 60s to reduce the chances of this.
const CLEANUP_WAIT_MS: u32 = 60000;
const IMPLICIT_BIND_GROUP_LAYOUT_ERROR_LABEL: &str = "Implicit BindGroupLayout in the Error State"; const IMPLICIT_BIND_GROUP_LAYOUT_ERROR_LABEL: &str = "Implicit BindGroupLayout in the Error State";
const ENTRYPOINT_FAILURE_ERROR: &str = "The given EntryPoint is Invalid"; const ENTRYPOINT_FAILURE_ERROR: &str = "The given EntryPoint is Invalid";


@ -1186,6 +1186,8 @@ impl Global {
// finish all the command buffers first // finish all the command buffers first
for &cmb_id in command_buffer_ids { for &cmb_id in command_buffer_ids {
profiling::scope!("process command buffer");
// we reset the used surface textures every time we use // we reset the used surface textures every time we use
// it, so make sure to set_size on it. // it, so make sure to set_size on it.
used_surface_textures.set_size(device.tracker_indices.textures.size()); used_surface_textures.set_size(device.tracker_indices.textures.size());
@ -1222,13 +1224,15 @@ impl Global {
continue; continue;
} }
// optimize the tracked states
// cmdbuf.trackers.optimize();
{ {
profiling::scope!("update submission ids");
let cmd_buf_data = cmdbuf.data.lock(); let cmd_buf_data = cmdbuf.data.lock();
let cmd_buf_trackers = &cmd_buf_data.as_ref().unwrap().trackers; let cmd_buf_trackers = &cmd_buf_data.as_ref().unwrap().trackers;
// update submission IDs // update submission IDs
{
profiling::scope!("buffers");
for buffer in cmd_buf_trackers.buffers.used_resources() { for buffer in cmd_buf_trackers.buffers.used_resources() {
if buffer.raw.get(&snatch_guard).is_none() { if buffer.raw.get(&snatch_guard).is_none() {
return Err(QueueSubmitError::DestroyedBuffer( return Err(QueueSubmitError::DestroyedBuffer(
@ -1246,6 +1250,9 @@ impl Global {
} }
} }
} }
}
{
profiling::scope!("textures");
for texture in cmd_buf_trackers.textures.used_resources() { for texture in cmd_buf_trackers.textures.used_resources() {
let should_extend = match texture.inner.get(&snatch_guard) { let should_extend = match texture.inner.get(&snatch_guard) {
None => { None => {
@ -1266,15 +1273,24 @@ impl Global {
if should_extend { if should_extend {
unsafe { unsafe {
used_surface_textures used_surface_textures
.merge_single(&texture, None, hal::TextureUses::PRESENT) .merge_single(
&texture,
None,
hal::TextureUses::PRESENT,
)
.unwrap(); .unwrap();
}; };
} }
} }
}
{
profiling::scope!("views");
for texture_view in cmd_buf_trackers.views.used_resources() { for texture_view in cmd_buf_trackers.views.used_resources() {
texture_view.info.use_at(submit_index); texture_view.info.use_at(submit_index);
} }
}
{ {
profiling::scope!("bind groups (+ referenced views/samplers)");
for bg in cmd_buf_trackers.bind_groups.used_resources() { for bg in cmd_buf_trackers.bind_groups.used_resources() {
bg.info.use_at(submit_index); bg.info.use_at(submit_index);
// We need to update the submission indices for the contained // We need to update the submission indices for the contained
@ -1288,20 +1304,32 @@ impl Global {
} }
} }
} }
// assert!(cmd_buf_trackers.samplers.is_empty()); {
profiling::scope!("compute pipelines");
for compute_pipeline in for compute_pipeline in
cmd_buf_trackers.compute_pipelines.used_resources() cmd_buf_trackers.compute_pipelines.used_resources()
{ {
compute_pipeline.info.use_at(submit_index); compute_pipeline.info.use_at(submit_index);
} }
}
{
profiling::scope!("render pipelines");
for render_pipeline in for render_pipeline in
cmd_buf_trackers.render_pipelines.used_resources() cmd_buf_trackers.render_pipelines.used_resources()
{ {
render_pipeline.info.use_at(submit_index); render_pipeline.info.use_at(submit_index);
} }
}
{
profiling::scope!("query sets");
for query_set in cmd_buf_trackers.query_sets.used_resources() { for query_set in cmd_buf_trackers.query_sets.used_resources() {
query_set.info.use_at(submit_index); query_set.info.use_at(submit_index);
} }
}
{
profiling::scope!(
"render bundles (+ referenced pipelines/query sets)"
);
for bundle in cmd_buf_trackers.bundles.used_resources() { for bundle in cmd_buf_trackers.bundles.used_resources() {
bundle.info.use_at(submit_index); bundle.info.use_at(submit_index);
// We need to update the submission indices for the contained // We need to update the submission indices for the contained
@ -1312,12 +1340,15 @@ impl Global {
{ {
render_pipeline.info.use_at(submit_index); render_pipeline.info.use_at(submit_index);
} }
for query_set in bundle.used.query_sets.read().used_resources() { for query_set in bundle.used.query_sets.read().used_resources()
{
query_set.info.use_at(submit_index); query_set.info.use_at(submit_index);
} }
} }
} }
}
let mut baked = cmdbuf.from_arc_into_baked(); let mut baked = cmdbuf.from_arc_into_baked();
// execute resource transitions // execute resource transitions
unsafe { unsafe {
baked baked
@ -1385,6 +1416,13 @@ impl Global {
raw: baked.encoder, raw: baked.encoder,
cmd_buffers: baked.list, cmd_buffers: baked.list,
}); });
{
// This involves actually decrementing the ref count of all command buffer
// resources, so can be _very_ expensive.
profiling::scope!("drop command buffer trackers");
drop(baked.trackers);
}
} }
log::trace!("Device after submission {}", submit_index); log::trace!("Device after submission {}", submit_index);
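
The submit path above wraps each per-resource loop in its own block headed by a `profiling::scope!`, and finally drops the command-buffer trackers inside a named scope so the ref-count decrements show up as their own span instead of being blamed on whatever runs next. A compact sketch of that block-plus-explicit-drop pattern, with invented types (again assuming the `profiling` crate):

struct Trackers {
    // Stand-in for the per-resource state a command buffer keeps alive.
    buffers: Vec<std::sync::Arc<Vec<u8>>>,
}

fn finish_submission(trackers: Trackers, submit_index: u64) {
    {
        // Each phase gets its own block so the span ends exactly where the work ends.
        profiling::scope!("buffers");
        for _buffer in &trackers.buffers {
            let _ = submit_index; // e.g. record "last used at submit_index" here
        }
    }
    {
        // Dropping the trackers decrements every Arc they hold, which can be
        // very expensive for large command buffers, so it gets its own span.
        profiling::scope!("drop command buffer trackers");
        drop(trackers);
    }
}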

View File

@ -84,9 +84,6 @@ naga-ir = ["dep:naga"]
## to the validation carried out at public APIs in all builds. ## to the validation carried out at public APIs in all builds.
strict_asserts = ["wgc?/strict_asserts", "wgt/strict_asserts"] strict_asserts = ["wgc?/strict_asserts", "wgt/strict_asserts"]
## Log all API entry points at info instead of trace level.
api_log_info = ["wgc/api_log_info"]
## Enables serialization via `serde` on common wgpu types. ## Enables serialization via `serde` on common wgpu types.
serde = ["dep:serde", "wgc/serde"] serde = ["dep:serde", "wgc/serde"]

View File

@ -13,11 +13,21 @@ Usage: xtask <COMMAND>
Commands: Commands:
run-wasm run-wasm
Build and run web examples
--release Build in release mode --release Build in release mode
--no-serve Just build the generated files, don't serve them --no-serve Just build the generated files, don't serve them
test test
Run tests
--llvm-cov Run tests with LLVM code coverage using the llvm-cov tool --llvm-cov Run tests with LLVM code coverage using the llvm-cov tool
--list List all of the tests and their executables without running them
--retries Number of times to retry failing tests
vendor-web-sys vendor-web-sys
Re-vendor the WebGPU web-sys bindings.
--no-cleanup Don't clean up temporary checkout of wasm-bindgen --no-cleanup Don't clean up temporary checkout of wasm-bindgen
One of: One of:
--path-to-checkout Path to a local checkout of wasm-bindgen to generate bindings from. --path-to-checkout Path to a local checkout of wasm-bindgen to generate bindings from.

View File

@ -5,7 +5,7 @@ use xshell::Shell;
use crate::util::{check_all_programs, Program}; use crate::util::{check_all_programs, Program};
pub(crate) fn run_wasm(shell: Shell, mut args: Arguments) -> Result<(), anyhow::Error> { pub(crate) fn run_wasm(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
let no_serve = args.contains("--no-serve"); let no_serve = args.contains("--no-serve");
let release = args.contains("--release"); let release = args.contains("--release");
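
`anyhow::Result<T>` is a type alias for `Result<T, anyhow::Error>`, so the signature change above is purely cosmetic; both spellings accept the same `?` conversions and `.context(...)` calls:

use anyhow::Context;

// Equivalent to `fn read_config(path: &str) -> Result<String, anyhow::Error>`.
fn read_config(path: &str) -> anyhow::Result<String> {
    std::fs::read_to_string(path).context("failed to read config")
}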

View File

@ -4,6 +4,12 @@ use xshell::Shell;
pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> { pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
let llvm_cov = args.contains("--llvm-cov"); let llvm_cov = args.contains("--llvm-cov");
let list = args.contains("--list");
let retries = args
.opt_value_from_str("--retries")?
.unwrap_or(0_u32)
.to_string();
// These needs to match the command in "run wgpu-info" in `.github/workflows/ci.yml` // These needs to match the command in "run wgpu-info" in `.github/workflows/ci.yml`
let llvm_cov_flags: &[_] = if llvm_cov { let llvm_cov_flags: &[_] = if llvm_cov {
&["llvm-cov", "--no-cfg-coverage", "--no-report"] &["llvm-cov", "--no-cfg-coverage", "--no-report"]
@ -12,16 +18,28 @@ pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
}; };
let llvm_cov_nextest_flags: &[_] = if llvm_cov { let llvm_cov_nextest_flags: &[_] = if llvm_cov {
&["llvm-cov", "--no-cfg-coverage", "--no-report", "nextest"] &["llvm-cov", "--no-cfg-coverage", "--no-report", "nextest"]
} else {
if list {
&["nextest", "list"]
} else { } else {
&["nextest", "run"] &["nextest", "run"]
}
}; };
log::info!("Generating .gpuconfig file based on gpus on the system"); log::info!("Generating .gpuconfig file based on gpus on the system");
xshell::cmd!( shell
shell, .cmd("cargo")
"cargo {llvm_cov_flags...} run --bin wgpu-info -- --json -o .gpuconfig" .args(llvm_cov_flags)
) .args([
"run",
"--bin",
"wgpu-info",
"--",
"--json",
"-o",
".gpuconfig",
])
.quiet() .quiet()
.run() .run()
.context("Failed to run wgpu-info to generate .gpuconfig")?; .context("Failed to run wgpu-info to generate .gpuconfig")?;
@ -39,12 +57,30 @@ pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
if gpu_count == 1 { "" } else { "s" } if gpu_count == 1 { "" } else { "s" }
); );
if list {
log::info!("Listing tests");
shell
.cmd("cargo")
.args(llvm_cov_nextest_flags)
.args(["-v", "--benches", "--tests", "--all-features"])
.args(args.finish())
.run()
.context("Failed to list tests")?;
return Ok(());
}
log::info!("Running cargo tests"); log::info!("Running cargo tests");
xshell::cmd!( shell
shell, .cmd("cargo")
"cargo {llvm_cov_nextest_flags...} --all-features --no-fail-fast --retries 2" .args(llvm_cov_nextest_flags)
) .args([
"--benches",
"--tests",
"--no-fail-fast",
"--all-features",
"--retries",
&retries,
])
.args(args.finish()) .args(args.finish())
.quiet() .quiet()
.run() .run()
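
Moving from the `xshell::cmd!` macro to the builder-style `Shell::cmd` makes it straightforward to splice in argument slices that are only known at runtime, such as `llvm_cov_nextest_flags` or the retry count. A hedged sketch of the same shape with a made-up subcommand:

use anyhow::Context;
use xshell::Shell;

fn list_tests(extra: &[String]) -> anyhow::Result<()> {
    let shell = Shell::new()?;
    let mode: &[&str] = &["nextest", "list"]; // chosen at runtime in the real code

    shell
        .cmd("cargo")
        .args(mode)
        .args(["--all-features"])
        .args(extra) // pass-through arguments from the caller
        .quiet() // don't echo the command line
        .run()
        .context("failed to list tests")?;
    Ok(())
}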

View File

@ -1,15 +1,15 @@
use std::{io, process::Command}; use std::{io, process::Command};
pub(crate) struct Program { pub(crate) struct Program {
pub binary_name: &'static str,
pub crate_name: &'static str, pub crate_name: &'static str,
pub binary_name: &'static str,
} }
pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> { pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
let mut failed = Vec::new(); let mut failed_crates = Vec::new();
for Program { for &Program {
binary_name,
crate_name, crate_name,
binary_name,
} in programs } in programs
{ {
let mut cmd = Command::new(binary_name); let mut cmd = Command::new(binary_name);
@ -21,7 +21,7 @@ pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
} }
Err(e) if matches!(e.kind(), io::ErrorKind::NotFound) => { Err(e) if matches!(e.kind(), io::ErrorKind::NotFound) => {
log::error!("Checking for {binary_name} in PATH: ❌"); log::error!("Checking for {binary_name} in PATH: ❌");
failed.push(*crate_name); failed_crates.push(crate_name);
} }
Err(e) => { Err(e) => {
log::error!("Checking for {binary_name} in PATH: ❌"); log::error!("Checking for {binary_name} in PATH: ❌");
@ -30,12 +30,13 @@ pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
} }
} }
if !failed.is_empty() { if !failed_crates.is_empty() {
log::error!( log::error!(
"Please install them with: cargo install {}", "Please install them with: cargo install {}",
failed.join(" ") failed_crates.join(" ")
); );
anyhow::bail!("Missing programs in PATH");
anyhow::bail!("Missing required programs");
} }
Ok(()) Ok(())
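
The util.rs changes above rename `failed` to `failed_crates` because what gets collected is the crate name to feed `cargo install`, not the binary name that was probed on PATH. A compact sketch of the same probe-and-report pattern, with simplified error handling rather than the xtask code itself:

use std::{io, process::Command};

struct Program {
    crate_name: &'static str,  // what `cargo install` needs
    binary_name: &'static str, // what should be on PATH
}

fn check_programs(programs: &[Program]) -> Result<(), String> {
    let mut missing_crates = Vec::new();
    for p in programs {
        // Spawning `<binary> --version` and hitting NotFound means it isn't installed.
        match Command::new(p.binary_name).arg("--version").output() {
            Ok(_) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => missing_crates.push(p.crate_name),
            Err(e) => return Err(format!("failed to run {}: {e}", p.binary_name)),
        }
    }
    if missing_crates.is_empty() {
        Ok(())
    } else {
        Err(format!("install with: cargo install {}", missing_crates.join(" ")))
    }
}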