From eeb1a9d7b751da1fd14768809ff55f15b9056504 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Thu, 16 May 2024 09:05:41 -0400 Subject: [PATCH] Add Benchmarks (#5694) --- .config/nextest.toml | 14 +- .deny.toml | 2 + .github/workflows/ci.yml | 2 +- Cargo.lock | 203 +++++++- Cargo.toml | 14 +- benches/Cargo.toml | 46 ++ benches/README.md | 95 ++++ benches/benches/renderpass-bindless.wgsl | 26 + benches/benches/renderpass.rs | 573 +++++++++++++++++++++++ benches/benches/renderpass.wgsl | 36 ++ benches/benches/resource_creation.rs | 71 +++ benches/benches/root.rs | 65 +++ benches/benches/shader.rs | 355 ++++++++++++++ naga/Cargo.toml | 8 - naga/benches/criterion.rs | 273 ----------- naga/fuzz/Cargo.toml | 4 + naga/src/back/hlsl/help.rs | 7 +- wgpu-core/src/command/memory_init.rs | 4 + wgpu-core/src/device/mod.rs | 4 +- wgpu-core/src/device/queue.rs | 152 +++--- wgpu/Cargo.toml | 3 - xtask/src/main.rs | 10 + xtask/src/run_wasm.rs | 2 +- xtask/src/test.rs | 68 ++- xtask/src/util.rs | 17 +- 25 files changed, 1674 insertions(+), 380 deletions(-) create mode 100644 benches/Cargo.toml create mode 100644 benches/README.md create mode 100644 benches/benches/renderpass-bindless.wgsl create mode 100644 benches/benches/renderpass.rs create mode 100644 benches/benches/renderpass.wgsl create mode 100644 benches/benches/resource_creation.rs create mode 100644 benches/benches/root.rs create mode 100644 benches/benches/shader.rs delete mode 100644 naga/benches/criterion.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index b8dbfe952..3d5a23b65 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -3,7 +3,17 @@ [profile.default] slow-timeout = { period = "45s", terminate-after = 2 } -# Use two threads for tests with "2_threads" in their name +# Use two threads for tests with "2 threads" in their name [[profile.default.overrides]] -filter = 'test(~2_threads)' +filter = 'test(~2_threads) | test(~2 threads)' threads-required = 2 + +# Use four threads for tests with "4 threads" in their name +[[profile.default.overrides]] +filter = 'test(~4_threads) | test(~4 threads)' +threads-required = 4 + +# Use eight threads for tests with "8 threads" in their name +[[profile.default.overrides]] +filter = 'test(~8_threads) | test(~8 threads)' +threads-required = 8 diff --git a/.deny.toml b/.deny.toml index 7e000d6f8..8448c81e8 100644 --- a/.deny.toml +++ b/.deny.toml @@ -1,6 +1,8 @@ [bans] multiple-versions = "deny" skip-tree = [ + # We never enable loom in any of our dependencies but it causes dupes + { name = "loom", version = "0.7.2" }, { name = "windows-sys", version = "0.45" }, { name = "winit", version = "0.27" }, { name = "winit", version = "0.29" }, diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55020c173..e2723f2ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -226,7 +226,7 @@ jobs: cargo clippy --target ${{ matrix.target }} --no-default-features # Check with all features. 
- cargo clippy --target ${{ matrix.target }} --tests --all-features + cargo clippy --target ${{ matrix.target }} --tests --benches --all-features # build docs cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps diff --git a/Cargo.lock b/Cargo.lock index 83bdcc7c5..1e1cdd65d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1511,6 +1511,20 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186014d53bc231d0090ef8d6f03e0920c54d85a5ed22f4f2f74315ec56cf83fb" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows 0.54.0", +] + [[package]] name = "gethostname" version = "0.4.3" @@ -1672,7 +1686,7 @@ dependencies = [ "presser", "thiserror", "winapi", - "windows", + "windows 0.52.0", ] [[package]] @@ -2047,6 +2061,19 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "malloc_buf" version = "0.0.6" @@ -2056,6 +2083,15 @@ dependencies = [ "libc", ] +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "memchr" version = "2.7.2" @@ -2141,11 +2177,9 @@ version = "0.20.0" dependencies = [ "arbitrary", "arrayvec 0.7.4", - "bincode", "bit-set", "bitflags 2.5.0", "codespan-reporting", - "criterion", "diff", "env_logger", "hexf-parse", @@ -2326,6 +2360,16 @@ dependencies = [ "rand_xorshift", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.5" @@ -2513,6 +2557,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "owned_ttf_parser" version = "0.21.0" @@ -2892,8 +2942,17 @@ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.6", + "regex-syntax 0.8.3", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -2904,9 +2963,15 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.3", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.3" @@ -3138,6 +3203,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shared_library" version = "0.1.9" @@ -3410,6 +3484,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + [[package]] name = "threadpool" version = "1.8.1" @@ -3567,6 +3651,59 @@ name = "tracing-core" version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "tracy-client" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fb931a64ff88984f86d3e9bcd1ae8843aa7fe44dd0f8097527bc172351741d" +dependencies = [ + "loom", + "once_cell", + "tracy-client-sys", +] + +[[package]] +name = "tracy-client-sys" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d104d610dfa9dd154535102cc9c6164ae1fa37842bc2d9e83f9ac82b0ae0882" +dependencies = [ + "cc", +] [[package]] name = "ttf-parser" @@ -3716,6 +3853,12 @@ dependencies = [ "which", ] +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "vec_map" version = "0.8.2" @@ -4077,6 +4220,23 @@ dependencies = [ "wgpu-types", ] +[[package]] +name = "wgpu-benchmark" +version = "0.20.0" +dependencies = [ + "bincode", + "bytemuck", + "criterion", + "naga", + "nanorand", + "once_cell", + "pollster", + "profiling", + "rayon", + "tracy-client", + "wgpu", +] + [[package]] name = "wgpu-core" version = "0.20.0" @@ -4304,7 +4464,17 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ - "windows-core", + "windows-core 0.52.0", + "windows-targets 0.52.5", +] + +[[package]] +name = "windows" +version = "0.54.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252e5725dbed82865af151df558e754e4a3c2c30818359eb17465f1346a1b49" +dependencies = [ + "windows-core 0.54.0", "windows-targets 0.52.5", ] @@ -4317,6 +4487,25 @@ dependencies = [ 
"windows-targets 0.52.5", ] +[[package]] +name = "windows-core" +version = "0.54.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12661b9c89351d684a50a8a643ce5f608e20243b9fb84687800163429f161d65" +dependencies = [ + "windows-result", + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-result" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "749f0da9cc72d82e600d8d2e44cadd0b9eedb9038f71a1c58556ac1c5791813b" +dependencies = [ + "windows-targets 0.52.5", +] + [[package]] name = "windows-sys" version = "0.36.1" diff --git a/Cargo.toml b/Cargo.toml index bfcc19e7f..7d142df64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,9 @@ members = [ "deno_webgpu", # default members + "benches", "d3d12", - "examples/", + "examples", "naga-cli", "naga", "naga/fuzz", @@ -22,8 +23,9 @@ members = [ ] exclude = [] default-members = [ + "benches", "d3d12", - "examples/", + "examples", "naga-cli", "naga", "naga/fuzz", @@ -70,11 +72,13 @@ version = "0.20.0" [workspace.dependencies] anyhow = "1.0.23" arrayvec = "0.7" +bincode = "1" bit-vec = "0.6" bitflags = "2" bytemuck = { version = "1.14", features = ["derive"] } cfg_aliases = "0.1" cfg-if = "1" +criterion = "0.5" codespan-reporting = "0.11" ctor = "0.2" document-features = "0.2.8" @@ -109,6 +113,7 @@ png = "0.17.11" pollster = "0.3" profiling = { version = "1", default-features = false } raw-window-handle = "0.6" +rayon = "1" renderdoc-sys = "1.1.0" ron = "0.8" rustc-hash = "1.1.0" @@ -116,6 +121,7 @@ serde = "1" serde_json = "1.0.116" smallvec = "1" static_assertions = "1.1.0" +tracy-client = "0.17" thiserror = "1" wgpu = { version = "0.20.0", path = "./wgpu" } wgpu-core = { version = "0.20.0", path = "./wgpu-core" } @@ -187,6 +193,10 @@ termcolor = "1.4.1" #js-sys = { path = "../wasm-bindgen/crates/js-sys" } #wasm-bindgen = { path = "../wasm-bindgen" } +[profile.release] +lto = "thin" +debug = true + # Speed up image comparison even in debug builds [profile.dev.package."nv-flip-sys"] opt-level = 3 diff --git a/benches/Cargo.toml b/benches/Cargo.toml new file mode 100644 index 000000000..65ac0eefd --- /dev/null +++ b/benches/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "wgpu-benchmark" +version.workspace = true +authors.workspace = true +edition.workspace = true +description = "wgpu benchmarking suite" +homepage.workspace = true +repository.workspace = true +keywords.workspace = true +license.workspace = true +autobenches = false +publish = false + +[[bench]] +name = "root" +harness = false +path = "benches/root.rs" + +[features] +# Uncomment these features to enable tracy and superluminal profiling. +# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"] +# superluminal = ["profiling/profile-with-superluminal"] + +[dependencies] +bincode.workspace = true +bytemuck.workspace = true +criterion.workspace = true +naga = { workspace = true, features = [ + "deserialize", + "serialize", + "wgsl-in", + "spv-in", + "glsl-in", + "spv-out", + "msl-out", + "hlsl-out", + "glsl-out", + "wgsl-out", +] } +nanorand.workspace = true +once_cell.workspace = true +pollster.workspace = true +profiling.workspace = true +rayon.workspace = true +tracy-client = { workspace = true, optional = true } +wgpu.workspace = true diff --git a/benches/README.md b/benches/README.md new file mode 100644 index 000000000..3f20cbba7 --- /dev/null +++ b/benches/README.md @@ -0,0 +1,95 @@ +Collection of CPU benchmarks for `wgpu`. 
+
+These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance users will see.
+They all do very little GPU work and test the CPU performance of the API.
+
+Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.
+
+## Usage
+
+```sh
+# Run all benchmarks
+cargo bench -p wgpu-benchmark
+# Run a specific benchmark that contains "filter" in its name
+cargo bench -p wgpu-benchmark -- "filter"
+```
+
+## Benchmarks
+
+#### `Renderpass`
+
+This benchmark measures the performance of recording and submitting a render pass with a large
+number of draw calls and resources, emulating an intense, more traditional graphics application.
+By default it measures 10k draw calls, with 90k total resources.
+
+Within this benchmark, both single-threaded and multi-threaded recording are tested, as well as splitting
+the render pass into multiple passes over multiple command buffers.
+
+#### `Resource Creation`
+
+This benchmark measures the performance of creating large resources. By default it creates buffers that are 256MB each. It tests this over a range of thread counts.
+
+#### `Shader Compilation`
+
+This benchmark measures the performance of naga parsing, validating, and generating shaders.
+
+## Comparing Against a Baseline
+
+To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
+
+For example, to compare v0.20 against trunk, you could run the following:
+
+```sh
+git checkout v0.20
+
+# Run the baseline benchmarks
+cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
+
+git checkout trunk
+
+# Run the current benchmarks
+cargo bench -p wgpu-benchmark -- --baseline "v0.20"
+```
+
+You can use this workflow for any changes you want to compare.
+
+## Integration with Profilers
+
+The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
+Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
+you need to uncomment these features in `Cargo.toml` before they can be used.
+
+#### Tracy
+
+Tracy is available prebuilt for Windows on [GitHub](https://github.com/wolfpld/tracy/releases/latest/).
+
+```sh
+# Once this is running, you can connect to it with the Tracy Profiler
+cargo bench -p wgpu-benchmark --features tracy
+```
+
+#### Superluminal
+
+Superluminal is a paid product for Windows, available [here](https://superluminal.eu/).
+
+```sh
+# This command will build the benchmarks, and display the path to the executable
+cargo bench -p wgpu-benchmark --features superluminal -- -h
+
+# Have Superluminal run the following command (replacing with the path to the executable)
+./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
+```
+
+#### `perf` and others
+
+You can follow the same pattern as above to run the benchmarks with other profilers.
+For example, the command-line tool `perf` can be used to profile the benchmarks.
+ +```sh +# This command will build the benchmarks, and display the path to the executable +cargo bench -p wgpu-benchmark -- -h + +# Run the benchmarks with perf +perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter" +``` + diff --git a/benches/benches/renderpass-bindless.wgsl b/benches/benches/renderpass-bindless.wgsl new file mode 100644 index 000000000..0277ef63b --- /dev/null +++ b/benches/benches/renderpass-bindless.wgsl @@ -0,0 +1,26 @@ +@group(0) @binding(0) +var tex: binding_array>; + +struct VertexOutput { + @builtin(position) position: vec4f, + @location(0) @interpolate(flat) instance_index: u32, +} + +@vertex +fn vs_main(@builtin(instance_index) instance_index: u32) -> VertexOutput { + return VertexOutput( + vec4f(0.0, 0.0, 0.0, 1.0), + instance_index + ); +} + +@fragment +fn fs_main(vs_in: VertexOutput) -> @location(0) vec4f { + return textureLoad(tex[7 * vs_in.instance_index + 0], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 1], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 2], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 3], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 4], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 5], vec2u(0), 0) + + textureLoad(tex[7 * vs_in.instance_index + 6], vec2u(0), 0); +} diff --git a/benches/benches/renderpass.rs b/benches/benches/renderpass.rs new file mode 100644 index 000000000..30543839a --- /dev/null +++ b/benches/benches/renderpass.rs @@ -0,0 +1,573 @@ +use std::{ + num::NonZeroU32, + time::{Duration, Instant}, +}; + +use criterion::{criterion_group, Criterion, Throughput}; +use nanorand::{Rng, WyRand}; +use once_cell::sync::Lazy; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use crate::DeviceState; + +const DRAW_COUNT: usize = 10_000; +// Must match the number of textures in the renderpass.wgsl shader +const TEXTURES_PER_DRAW: usize = 7; +const VERTEX_BUFFERS_PER_DRAW: usize = 2; +const VERTEX_BUFFER_COUNT: usize = DRAW_COUNT * VERTEX_BUFFERS_PER_DRAW; + +const TEXTURE_COUNT: usize = DRAW_COUNT * TEXTURES_PER_DRAW; + +struct RenderpassState { + device_state: DeviceState, + pipeline: wgpu::RenderPipeline, + bind_groups: Vec, + vertex_buffers: Vec, + index_buffers: Vec, + render_target: wgpu::TextureView, + + // Bindless resources + bindless_bind_group: Option, + bindless_pipeline: Option, +} + +impl RenderpassState { + /// Create and prepare all the resources needed for the renderpass benchmark. + fn new() -> Self { + let device_state = DeviceState::new(); + + let supports_bindless = device_state.device.features().contains( + wgpu::Features::TEXTURE_BINDING_ARRAY + | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING, + ) && device_state + .device + .limits() + .max_sampled_textures_per_shader_stage + >= TEXTURE_COUNT as _; + + // Performance gets considerably worse if the resources are shuffled. + // + // This more closely matches the real-world use case where resources have no + // well defined usage order. 
+ let mut random = WyRand::new_seed(0x8BADF00D); + + let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW); + for i in 0..TEXTURES_PER_DRAW { + bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry { + binding: i as u32, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: true }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }); + } + + let bind_group_layout = + device_state + .device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &bind_group_layout_entries, + }); + + let mut texture_views = Vec::with_capacity(TEXTURE_COUNT); + for i in 0..TEXTURE_COUNT { + let texture = device_state + .device + .create_texture(&wgpu::TextureDescriptor { + label: Some(&format!("Texture {i}")), + size: wgpu::Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8UnormSrgb, + usage: wgpu::TextureUsages::TEXTURE_BINDING, + view_formats: &[], + }); + texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor { + label: Some(&format!("Texture View {i}")), + ..Default::default() + })); + } + random.shuffle(&mut texture_views); + + let texture_view_refs: Vec<_> = texture_views.iter().collect(); + + let mut bind_groups = Vec::with_capacity(DRAW_COUNT); + for draw_idx in 0..DRAW_COUNT { + let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW); + for tex_idx in 0..TEXTURES_PER_DRAW { + entries.push(wgpu::BindGroupEntry { + binding: tex_idx as u32, + resource: wgpu::BindingResource::TextureView( + &texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx], + ), + }); + } + + bind_groups.push( + device_state + .device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: None, + layout: &bind_group_layout, + entries: &entries, + }), + ); + } + random.shuffle(&mut bind_groups); + + let sm = device_state + .device + .create_shader_module(wgpu::include_wgsl!("renderpass.wgsl")); + + let pipeline_layout = + device_state + .device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let mut vertex_buffers = Vec::with_capacity(VERTEX_BUFFER_COUNT); + for _ in 0..VERTEX_BUFFER_COUNT { + vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 3 * 16, + usage: wgpu::BufferUsages::VERTEX, + mapped_at_creation: false, + })); + } + random.shuffle(&mut vertex_buffers); + + let mut index_buffers = Vec::with_capacity(DRAW_COUNT); + for _ in 0..DRAW_COUNT { + index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 3 * 4, + usage: wgpu::BufferUsages::INDEX, + mapped_at_creation: false, + })); + } + random.shuffle(&mut index_buffers); + + let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW); + for i in 0..VERTEX_BUFFERS_PER_DRAW { + vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]); + } + + let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW); + for attributes in &vertex_buffer_attributes { + vertex_buffer_layouts.push(wgpu::VertexBufferLayout { + array_stride: 16, + step_mode: wgpu::VertexStepMode::Vertex, + attributes, + }); + } + + let pipeline = + device_state + .device + 
.create_render_pipeline(&wgpu::RenderPipelineDescriptor { + label: None, + layout: Some(&pipeline_layout), + vertex: wgpu::VertexState { + module: &sm, + entry_point: "vs_main", + buffers: &vertex_buffer_layouts, + compilation_options: wgpu::PipelineCompilationOptions::default(), + }, + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleList, + strip_index_format: None, + front_face: wgpu::FrontFace::Cw, + cull_mode: Some(wgpu::Face::Back), + polygon_mode: wgpu::PolygonMode::Fill, + unclipped_depth: false, + conservative: false, + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + fragment: Some(wgpu::FragmentState { + module: &sm, + entry_point: "fs_main", + targets: &[Some(wgpu::ColorTargetState { + format: wgpu::TextureFormat::Rgba8UnormSrgb, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + })], + compilation_options: wgpu::PipelineCompilationOptions::default(), + }), + multiview: None, + }); + + let render_target = device_state + .device + .create_texture(&wgpu::TextureDescriptor { + label: Some("Render Target"), + size: wgpu::Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8UnormSrgb, + usage: wgpu::TextureUsages::RENDER_ATTACHMENT, + view_formats: &[], + }) + .create_view(&wgpu::TextureViewDescriptor::default()); + + let mut bindless_bind_group = None; + let mut bindless_pipeline = None; + + if supports_bindless { + let bindless_bind_group_layout = + device_state + .device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &[wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: true }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()), + }], + }); + + bindless_bind_group = Some(device_state.device.create_bind_group( + &wgpu::BindGroupDescriptor { + label: None, + layout: &bindless_bind_group_layout, + entries: &[wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs), + }], + }, + )); + + let bindless_shader_module = device_state + .device + .create_shader_module(wgpu::include_wgsl!("renderpass-bindless.wgsl")); + + let bindless_pipeline_layout = + device_state + .device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bindless_bind_group_layout], + push_constant_ranges: &[], + }); + + bindless_pipeline = Some(device_state.device.create_render_pipeline( + &wgpu::RenderPipelineDescriptor { + label: None, + layout: Some(&bindless_pipeline_layout), + vertex: wgpu::VertexState { + module: &bindless_shader_module, + entry_point: "vs_main", + buffers: &vertex_buffer_layouts, + compilation_options: wgpu::PipelineCompilationOptions::default(), + }, + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleList, + strip_index_format: None, + front_face: wgpu::FrontFace::Cw, + cull_mode: Some(wgpu::Face::Back), + polygon_mode: wgpu::PolygonMode::Fill, + unclipped_depth: false, + conservative: false, + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + fragment: Some(wgpu::FragmentState { + module: &bindless_shader_module, + entry_point: "fs_main", + targets: &[Some(wgpu::ColorTargetState { + 
format: wgpu::TextureFormat::Rgba8UnormSrgb, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + })], + compilation_options: wgpu::PipelineCompilationOptions::default(), + }), + multiview: None, + }, + )); + } + + Self { + device_state, + pipeline, + bind_groups, + vertex_buffers, + index_buffers, + render_target, + + bindless_bind_group, + bindless_pipeline, + } + } + + fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer { + profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}")); + + let draws_per_pass = DRAW_COUNT / total_passes; + + let mut encoder = self + .device_state + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + + let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: &self.render_target, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::BLACK), + store: wgpu::StoreOp::Store, + }, + })], + occlusion_query_set: None, + timestamp_writes: None, + depth_stencil_attachment: None, + }); + + let start_idx = pass_number * draws_per_pass; + let end_idx = start_idx + draws_per_pass; + for draw_idx in start_idx..end_idx { + render_pass.set_pipeline(&self.pipeline); + render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]); + for i in 0..VERTEX_BUFFERS_PER_DRAW { + render_pass.set_vertex_buffer( + i as u32, + self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..), + ); + } + render_pass.set_index_buffer( + self.index_buffers[draw_idx].slice(..), + wgpu::IndexFormat::Uint32, + ); + render_pass.draw_indexed(0..3, 0, 0..1); + } + + drop(render_pass); + + encoder.finish() + } + + fn run_bindless_pass(&self) -> wgpu::CommandBuffer { + profiling::scope!("Bindless Renderpass"); + + let mut encoder = self + .device_state + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + + let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: &self.render_target, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::BLACK), + store: wgpu::StoreOp::Store, + }, + })], + occlusion_query_set: None, + timestamp_writes: None, + depth_stencil_attachment: None, + }); + + render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap()); + render_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]); + for i in 0..VERTEX_BUFFERS_PER_DRAW { + render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..)); + } + render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32); + + for draw_idx in 0..DRAW_COUNT { + render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1); + } + + drop(render_pass); + + encoder.finish() + } +} + +fn run_bench(ctx: &mut Criterion) { + let state = Lazy::new(RenderpassState::new); + + // Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses + let mut group = ctx.benchmark_group("Renderpass: Single Threaded"); + group.throughput(Throughput::Elements(DRAW_COUNT as _)); + + for time_submit in [false, true] { + for rpasses in [1, 2, 4, 8] { + let draws_per_pass = DRAW_COUNT / rpasses; + + let label = if time_submit { + "Submit Time" + } else { + "Renderpass Time" + }; + + group.bench_function( + &format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"), + |b| { + 
Lazy::force(&state); + + b.iter_custom(|iters| { + profiling::scope!("benchmark invocation"); + + // This benchmark hangs on Apple Paravirtualized GPUs. No idea why. + if state.device_state.adapter_info.name.contains("Paravirtual") { + return Duration::from_secs_f32(1.0); + } + + let mut duration = Duration::ZERO; + + for _ in 0..iters { + profiling::scope!("benchmark iteration"); + + let mut start = Instant::now(); + + let mut buffers: Vec = Vec::with_capacity(rpasses); + for i in 0..rpasses { + buffers.push(state.run_subpass(i, rpasses)); + } + + if time_submit { + start = Instant::now(); + } else { + duration += start.elapsed(); + } + + state.device_state.queue.submit(buffers); + + if time_submit { + duration += start.elapsed(); + } + + state.device_state.device.poll(wgpu::Maintain::Wait); + } + + duration + }) + }, + ); + } + } + group.finish(); + + // Test 10k draw calls split up over 2, 4, and 8 threads. + let mut group = ctx.benchmark_group("Renderpass: Multi Threaded"); + group.throughput(Throughput::Elements(DRAW_COUNT as _)); + + for threads in [2, 4, 8] { + let draws_per_pass = DRAW_COUNT / threads; + group.bench_function( + &format!("{threads} threads x {draws_per_pass} draws"), + |b| { + Lazy::force(&state); + + b.iter_custom(|iters| { + profiling::scope!("benchmark invocation"); + + // This benchmark hangs on Apple Paravirtualized GPUs. No idea why. + if state.device_state.adapter_info.name.contains("Paravirtual") { + return Duration::from_secs_f32(1.0); + } + + let mut duration = Duration::ZERO; + + for _ in 0..iters { + profiling::scope!("benchmark iteration"); + + let start = Instant::now(); + + let buffers = (0..threads) + .into_par_iter() + .map(|i| state.run_subpass(i, threads)) + .collect::>(); + + duration += start.elapsed(); + + state.device_state.queue.submit(buffers); + state.device_state.device.poll(wgpu::Maintain::Wait); + } + + duration + }) + }, + ); + } + group.finish(); + + // Test 10k draw calls split up over 1, 2, 4, and 8 threads. + let mut group = ctx.benchmark_group("Renderpass: Bindless"); + group.throughput(Throughput::Elements(DRAW_COUNT as _)); + + group.bench_function(&format!("{DRAW_COUNT} draws"), |b| { + Lazy::force(&state); + + b.iter_custom(|iters| { + profiling::scope!("benchmark invocation"); + + // Need bindless to run this benchmark + if state.bindless_bind_group.is_none() { + return Duration::from_secs_f32(1.0); + } + + let mut duration = Duration::ZERO; + + for _ in 0..iters { + profiling::scope!("benchmark iteration"); + + let start = Instant::now(); + + let buffer = state.run_bindless_pass(); + + duration += start.elapsed(); + + state.device_state.queue.submit([buffer]); + state.device_state.device.poll(wgpu::Maintain::Wait); + } + + duration + }) + }); + group.finish(); + + ctx.bench_function( + &format!( + "Renderpass: Empty Submit with {} Resources", + TEXTURE_COUNT + VERTEX_BUFFER_COUNT + ), + |b| { + Lazy::force(&state); + + b.iter(|| state.device_state.queue.submit([])); + }, + ); +} + +criterion_group! 
{ + name = renderpass; + config = Criterion::default().measurement_time(Duration::from_secs(10)); + targets = run_bench, +} diff --git a/benches/benches/renderpass.wgsl b/benches/benches/renderpass.wgsl new file mode 100644 index 000000000..948fd6e2f --- /dev/null +++ b/benches/benches/renderpass.wgsl @@ -0,0 +1,36 @@ +@group(0) @binding(0) +var tex_1: texture_2d; + +@group(0) @binding(1) +var tex_2: texture_2d; + +@group(0) @binding(2) +var tex_3: texture_2d; + +@group(0) @binding(3) +var tex_4: texture_2d; + +@group(0) @binding(4) +var tex_5: texture_2d; + +@group(0) @binding(5) +var tex_6: texture_2d; + +@group(0) @binding(6) +var tex_7: texture_2d; + +@vertex +fn vs_main() -> @builtin(position) vec4f { + return vec4f(0.0, 0.0, 0.0, 1.0); +} + +@fragment +fn fs_main() -> @location(0) vec4f { + return textureLoad(tex_1, vec2u(0), 0) + + textureLoad(tex_2, vec2u(0), 0) + + textureLoad(tex_3, vec2u(0), 0) + + textureLoad(tex_4, vec2u(0), 0) + + textureLoad(tex_5, vec2u(0), 0) + + textureLoad(tex_6, vec2u(0), 0) + + textureLoad(tex_7, vec2u(0), 0); +} diff --git a/benches/benches/resource_creation.rs b/benches/benches/resource_creation.rs new file mode 100644 index 000000000..c23f132bb --- /dev/null +++ b/benches/benches/resource_creation.rs @@ -0,0 +1,71 @@ +use std::time::{Duration, Instant}; + +use criterion::{criterion_group, Criterion, Throughput}; +use once_cell::sync::Lazy; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use crate::DeviceState; + +fn run_bench(ctx: &mut Criterion) { + let state = Lazy::new(DeviceState::new); + + const RESOURCES_TO_CREATE: usize = 8; + + let mut group = ctx.benchmark_group("Resource Creation: Large Buffer"); + group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _)); + + for threads in [1, 2, 4, 8] { + let resources_per_thread = RESOURCES_TO_CREATE / threads; + group.bench_function( + &format!("{threads} threads x {resources_per_thread} resource"), + |b| { + Lazy::force(&state); + + b.iter_custom(|iters| { + profiling::scope!("benchmark invocation"); + + let mut duration = Duration::ZERO; + + for _ in 0..iters { + profiling::scope!("benchmark iteration"); + + // We can't create too many resources at once, so we do it 8 resources at a time. + let start = Instant::now(); + + let buffers = (0..threads) + .into_par_iter() + .map(|_| { + (0..resources_per_thread) + .map(|_| { + state.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 256 * 1024 * 1024, + usage: wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }) + }) + .collect::>() + }) + .collect::>(); + + duration += start.elapsed(); + + drop(buffers); + + state.queue.submit([]); + state.device.poll(wgpu::Maintain::Wait); + } + + duration + }) + }, + ); + } + group.finish(); +} + +criterion_group! { + name = resource_creation; + config = Criterion::default().measurement_time(Duration::from_secs(10)); + targets = run_bench, +} diff --git a/benches/benches/root.rs b/benches/benches/root.rs new file mode 100644 index 000000000..98563f839 --- /dev/null +++ b/benches/benches/root.rs @@ -0,0 +1,65 @@ +use criterion::criterion_main; +use pollster::block_on; + +mod renderpass; +mod resource_creation; +mod shader; + +struct DeviceState { + adapter_info: wgpu::AdapterInfo, + device: wgpu::Device, + queue: wgpu::Queue, +} + +impl DeviceState { + fn new() -> Self { + #[cfg(feature = "tracy")] + tracy_client::Client::start(); + + let base_backend = if cfg!(target_os = "macos") { + // We don't want to use Molten-VK on Mac. 
+ wgpu::Backends::METAL + } else { + wgpu::Backends::all() + }; + + let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { + backends: wgpu::util::backend_bits_from_env().unwrap_or(base_backend), + flags: wgpu::InstanceFlags::empty(), + dx12_shader_compiler: wgpu::util::dx12_shader_compiler_from_env() + .unwrap_or(wgpu::Dx12Compiler::Fxc), + gles_minor_version: wgpu::Gles3MinorVersion::Automatic, + }); + + let adapter = block_on(wgpu::util::initialize_adapter_from_env_or_default( + &instance, None, + )) + .unwrap(); + + let adapter_info = adapter.get_info(); + + eprintln!("{:?}", adapter_info); + + let (device, queue) = block_on(adapter.request_device( + &wgpu::DeviceDescriptor { + required_features: adapter.features(), + required_limits: adapter.limits(), + label: Some("RenderPass Device"), + }, + None, + )) + .unwrap(); + + Self { + adapter_info, + device, + queue, + } + } +} + +criterion_main!( + renderpass::renderpass, + resource_creation::resource_creation, + shader::shader +); diff --git a/benches/benches/shader.rs b/benches/benches/shader.rs new file mode 100644 index 000000000..6d20b6029 --- /dev/null +++ b/benches/benches/shader.rs @@ -0,0 +1,355 @@ +use criterion::*; +use std::{fs, path::PathBuf}; + +struct Input { + filename: String, + size: u64, + data: Vec, + string: Option, + module: Option, + module_info: Option, +} + +struct Inputs { + inner: Vec, +} + +impl Inputs { + fn from_dir(folder: &str, extension: &str) -> Self { + let mut inputs = Vec::new(); + let read_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(folder) + .read_dir() + .unwrap(); + + for file_entry in read_dir { + match file_entry { + Ok(entry) => match entry.path().extension() { + Some(ostr) if ostr == extension => { + let path = entry.path(); + + inputs.push(Input { + filename: path.to_string_lossy().into_owned(), + size: entry.metadata().unwrap().len(), + string: None, + data: vec![], + module: None, + module_info: None, + }); + } + _ => continue, + }, + Err(e) => { + eprintln!("Skipping file: {:?}", e); + continue; + } + } + } + + Self { inner: inputs } + } + + fn bytes(&self) -> u64 { + self.inner.iter().map(|input| input.size).sum() + } + + fn load(&mut self) { + for input in &mut self.inner { + if !input.data.is_empty() { + continue; + } + + input.data = fs::read(&input.filename).unwrap_or_default(); + } + } + + fn load_utf8(&mut self) { + self.load(); + + for input in &mut self.inner { + if input.string.is_some() { + continue; + } + + input.string = Some(std::str::from_utf8(&input.data).unwrap().to_string()); + } + } + + fn parse(&mut self) { + self.load_utf8(); + + let mut parser = naga::front::wgsl::Frontend::new(); + for input in &mut self.inner { + if input.module.is_some() { + continue; + } + + input.module = Some(parser.parse(input.string.as_ref().unwrap()).unwrap()); + } + } + + fn validate(&mut self) { + self.parse(); + + let mut validator = naga::valid::Validator::new( + naga::valid::ValidationFlags::all(), + // Note, this is empty, to let all backends work. 
+ naga::valid::Capabilities::empty(), + ); + + for input in &mut self.inner { + if input.module_info.is_some() { + continue; + } + + input.module_info = validator.validate(input.module.as_ref().unwrap()).ok(); + } + + self.inner.retain(|input| input.module_info.is_some()); + } +} + +fn parse_glsl(stage: naga::ShaderStage, inputs: &Inputs) { + let mut parser = naga::front::glsl::Frontend::default(); + let options = naga::front::glsl::Options { + stage, + defines: Default::default(), + }; + for input in &inputs.inner { + parser + .parse(&options, input.string.as_deref().unwrap()) + .unwrap(); + } +} + +fn frontends(c: &mut Criterion) { + let mut group = c.benchmark_group("front"); + + let mut inputs_wgsl = Inputs::from_dir("../naga/tests/in", "wgsl"); + group.throughput(Throughput::Bytes(inputs_wgsl.bytes())); + group.bench_function("shader: naga module bincode decode", |b| { + inputs_wgsl.parse(); + + let inputs_bin = inputs_wgsl + .inner + .iter() + .map(|input| bincode::serialize(&input.module.as_ref().unwrap()).unwrap()) + .collect::>(); + + b.iter(move || { + for input in inputs_bin.iter() { + bincode::deserialize::(input).unwrap(); + } + }); + }); + + group.bench_function("shader: wgsl-in", |b| { + inputs_wgsl.load_utf8(); + + let mut frontend = naga::front::wgsl::Frontend::new(); + b.iter(|| { + for input in &inputs_wgsl.inner { + frontend.parse(input.string.as_ref().unwrap()).unwrap(); + } + }); + }); + + let mut inputs_spirv = Inputs::from_dir("../naga/tests/in/spv", "spv"); + group.throughput(Throughput::Bytes(inputs_spirv.bytes())); + group.bench_function("shader: spv-in", |b| { + inputs_spirv.load(); + + b.iter(|| { + let options = naga::front::spv::Options::default(); + for input in &inputs_spirv.inner { + let spv = bytemuck::cast_slice(&input.data); + let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options); + parser.parse().unwrap(); + } + }); + }); + + let mut inputs_vertex = Inputs::from_dir("../naga/tests/in/glsl", "vert"); + let mut inputs_fragment = Inputs::from_dir("../naga/tests/in/glsl", "frag"); + // let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp"); + group.throughput(Throughput::Bytes( + inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes() + )); + group.bench_function("shader: glsl-in", |b| { + inputs_vertex.load(); + inputs_vertex.load_utf8(); + inputs_fragment.load_utf8(); + // inputs_compute.load_utf8(); + + b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex)); + b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_fragment)); + // TODO: This one hangs for some reason + // b.iter(move || parse_glsl(naga::ShaderStage::Compute, &inputs_compute)); + }); +} + +fn validation(c: &mut Criterion) { + let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl"); + + let mut group = c.benchmark_group("validate"); + group.throughput(Throughput::Bytes(inputs.bytes())); + group.bench_function("shader: validation", |b| { + inputs.load(); + inputs.load_utf8(); + inputs.parse(); + + let mut validator = naga::valid::Validator::new( + naga::valid::ValidationFlags::all(), + naga::valid::Capabilities::all(), + ); + validator + .subgroup_stages(naga::valid::ShaderStages::all()) + .subgroup_operations(naga::valid::SubgroupOperationSet::all()); + b.iter(|| { + for input in &inputs.inner { + validator.validate(input.module.as_ref().unwrap()).unwrap(); + } + }); + }); + group.finish(); +} + +fn backends(c: &mut Criterion) { + let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl"); + + let mut group 
= c.benchmark_group("back"); + // While normally this would be done inside the bench_function callback, we need to + // run this to properly know the size of the inputs, as any that fail validation + // will be removed. + inputs.validate(); + + group.throughput(Throughput::Bytes(inputs.bytes())); + group.bench_function("shader: wgsl-out", |b| { + b.iter(|| { + let mut string = String::new(); + let flags = naga::back::wgsl::WriterFlags::empty(); + for input in &inputs.inner { + let mut writer = naga::back::wgsl::Writer::new(&mut string, flags); + let _ = writer.write( + input.module.as_ref().unwrap(), + input.module_info.as_ref().unwrap(), + ); + string.clear(); + } + }); + }); + + group.bench_function("shader: spv-out", |b| { + b.iter(|| { + let mut data = Vec::new(); + let options = naga::back::spv::Options::default(); + for input in &inputs.inner { + let mut writer = naga::back::spv::Writer::new(&options).unwrap(); + let _ = writer.write( + input.module.as_ref().unwrap(), + input.module_info.as_ref().unwrap(), + None, + &None, + &mut data, + ); + data.clear(); + } + }); + }); + group.bench_function("shader: spv-out multiple entrypoints", |b| { + b.iter(|| { + let mut data = Vec::new(); + let options = naga::back::spv::Options::default(); + for input in &inputs.inner { + let mut writer = naga::back::spv::Writer::new(&options).unwrap(); + let module = input.module.as_ref().unwrap(); + for ep in module.entry_points.iter() { + let pipeline_options = naga::back::spv::PipelineOptions { + shader_stage: ep.stage, + entry_point: ep.name.clone(), + }; + let _ = writer.write( + input.module.as_ref().unwrap(), + input.module_info.as_ref().unwrap(), + Some(&pipeline_options), + &None, + &mut data, + ); + data.clear(); + } + } + }); + }); + + group.bench_function("shader: msl-out", |b| { + b.iter(|| { + let mut string = String::new(); + let options = naga::back::msl::Options::default(); + for input in &inputs.inner { + let pipeline_options = naga::back::msl::PipelineOptions::default(); + let mut writer = naga::back::msl::Writer::new(&mut string); + let _ = writer.write( + input.module.as_ref().unwrap(), + input.module_info.as_ref().unwrap(), + &options, + &pipeline_options, + ); + string.clear(); + } + }); + }); + + group.bench_function("shader: hlsl-out", |b| { + b.iter(|| { + let options = naga::back::hlsl::Options::default(); + let mut string = String::new(); + for input in &inputs.inner { + let mut writer = naga::back::hlsl::Writer::new(&mut string, &options); + let _ = writer.write( + input.module.as_ref().unwrap(), + input.module_info.as_ref().unwrap(), + ); // may fail on unimplemented things + string.clear(); + } + }); + }); + + group.bench_function("shader: glsl-out multiple entrypoints", |b| { + b.iter(|| { + let mut string = String::new(); + let options = naga::back::glsl::Options { + version: naga::back::glsl::Version::new_gles(320), + writer_flags: naga::back::glsl::WriterFlags::empty(), + binding_map: Default::default(), + zero_initialize_workgroup_memory: true, + }; + for input in &inputs.inner { + let module = input.module.as_ref().unwrap(); + let info = input.module_info.as_ref().unwrap(); + for ep in module.entry_points.iter() { + let pipeline_options = naga::back::glsl::PipelineOptions { + shader_stage: ep.stage, + entry_point: ep.name.clone(), + multiview: None, + }; + + // might be `Err` if missing features + if let Ok(mut writer) = naga::back::glsl::Writer::new( + &mut string, + module, + info, + &options, + &pipeline_options, + naga::proc::BoundsCheckPolicies::default(), + ) { + 
let _ = writer.write(); // might be `Err` if unsupported + } + + string.clear(); + } + } + }); + }); +} + +criterion_group!(shader, frontends, validation, backends); diff --git a/naga/Cargo.toml b/naga/Cargo.toml index 3041a6009..22e172d47 100644 --- a/naga/Cargo.toml +++ b/naga/Cargo.toml @@ -35,10 +35,6 @@ wgsl-out = [] hlsl-out = [] compact = [] -[[bench]] -name = "criterion" -harness = false - [dependencies] arbitrary = { version = "1.3", features = ["derive"], optional = true } bitflags = "2.5" @@ -60,11 +56,7 @@ hexf-parse = { version = "0.2.1", optional = true } unicode-xid = { version = "0.2.3", optional = true } arrayvec.workspace = true -[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] -criterion = { version = "0.5", features = [] } - [dev-dependencies] -bincode = "1" diff = "0.1" env_logger = "0.11" # This _cannot_ have a version specified. If it does, crates.io will look diff --git a/naga/benches/criterion.rs b/naga/benches/criterion.rs deleted file mode 100644 index e57c58a84..000000000 --- a/naga/benches/criterion.rs +++ /dev/null @@ -1,273 +0,0 @@ -#![cfg(not(target_arch = "wasm32"))] -#![allow(clippy::needless_borrowed_reference)] - -use criterion::*; -use std::{fs, path::PathBuf, slice}; - -fn gather_inputs(folder: &str, extension: &str) -> Vec> { - let mut list = Vec::new(); - let read_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join(folder) - .read_dir() - .unwrap(); - for file_entry in read_dir { - match file_entry { - Ok(entry) => match entry.path().extension() { - Some(ostr) if ostr == extension => { - let input = fs::read(entry.path()).unwrap_or_default(); - list.push(input.into_boxed_slice()); - } - _ => continue, - }, - Err(e) => { - log::warn!("Skipping file: {:?}", e); - continue; - } - } - } - list -} - -fn parse_glsl(stage: naga::ShaderStage, inputs: &[Box<[u8]>]) { - let mut parser = naga::front::glsl::Frontend::default(); - let options = naga::front::glsl::Options { - stage, - defines: Default::default(), - }; - for input in inputs.iter() { - let string = std::str::from_utf8(input).unwrap(); - parser.parse(&options, string).unwrap(); - } -} - -fn frontends(c: &mut Criterion) { - let mut group = c.benchmark_group("front"); - #[cfg(all(feature = "wgsl-in", feature = "serialize", feature = "deserialize"))] - group.bench_function("bin", |b| { - let inputs_wgsl = gather_inputs("tests/in", "wgsl"); - let mut frontend = naga::front::wgsl::Frontend::new(); - let inputs_bin = inputs_wgsl - .iter() - .map(|input| { - let string = std::str::from_utf8(input).unwrap(); - let module = frontend.parse(string).unwrap(); - bincode::serialize(&module).unwrap() - }) - .collect::>(); - b.iter(move || { - for input in inputs_bin.iter() { - bincode::deserialize::(input).unwrap(); - } - }); - }); - #[cfg(feature = "wgsl-in")] - group.bench_function("wgsl", |b| { - let inputs_wgsl = gather_inputs("tests/in", "wgsl"); - let inputs = inputs_wgsl - .iter() - .map(|input| std::str::from_utf8(input).unwrap()) - .collect::>(); - let mut frontend = naga::front::wgsl::Frontend::new(); - b.iter(move || { - for &input in inputs.iter() { - frontend.parse(input).unwrap(); - } - }); - }); - #[cfg(feature = "spv-in")] - group.bench_function("spv", |b| { - let inputs = gather_inputs("tests/in/spv", "spv"); - b.iter(move || { - let options = naga::front::spv::Options::default(); - for input in inputs.iter() { - let spv = - unsafe { slice::from_raw_parts(input.as_ptr() as *const u32, input.len() / 4) }; - let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options); 
- parser.parse().unwrap(); - } - }); - }); - #[cfg(feature = "glsl-in")] - group.bench_function("glsl", |b| { - let vert = gather_inputs("tests/in/glsl", "vert"); - b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &vert)); - let frag = gather_inputs("tests/in/glsl", "frag"); - b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &frag)); - //TODO: hangs for some reason! - //let comp = gather_inputs("tests/in/glsl", "comp"); - //b.iter(move || parse_glsl(naga::ShaderStage::Compute, &comp)); - }); -} - -#[cfg(feature = "wgsl-in")] -fn gather_modules() -> Vec { - let inputs = gather_inputs("tests/in", "wgsl"); - let mut frontend = naga::front::wgsl::Frontend::new(); - inputs - .iter() - .map(|input| { - let string = std::str::from_utf8(input).unwrap(); - frontend.parse(string).unwrap() - }) - .collect() -} -#[cfg(not(feature = "wgsl-in"))] -fn gather_modules() -> Vec { - Vec::new() -} - -fn validation(c: &mut Criterion) { - let inputs = gather_modules(); - let mut group = c.benchmark_group("valid"); - group.bench_function("safe", |b| { - let mut validator = naga::valid::Validator::new( - naga::valid::ValidationFlags::all(), - naga::valid::Capabilities::all(), - ); - b.iter(|| { - for input in inputs.iter() { - validator.validate(input).unwrap(); - } - }); - }); - group.bench_function("unsafe", |b| { - let mut validator = naga::valid::Validator::new( - naga::valid::ValidationFlags::empty(), - naga::valid::Capabilities::all(), - ); - b.iter(|| { - for input in inputs.iter() { - validator.validate(input).unwrap(); - } - }); - }); -} - -fn backends(c: &mut Criterion) { - let inputs = { - let mut validator = naga::valid::Validator::new( - naga::valid::ValidationFlags::empty(), - naga::valid::Capabilities::default(), - ); - let input_modules = gather_modules(); - input_modules - .into_iter() - .flat_map(|module| validator.validate(&module).ok().map(|info| (module, info))) - .collect::>() - }; - - let mut group = c.benchmark_group("back"); - #[cfg(feature = "wgsl-out")] - group.bench_function("wgsl", |b| { - b.iter(|| { - let mut string = String::new(); - let flags = naga::back::wgsl::WriterFlags::empty(); - for &(ref module, ref info) in inputs.iter() { - let mut writer = naga::back::wgsl::Writer::new(&mut string, flags); - writer.write(module, info).unwrap(); - string.clear(); - } - }); - }); - - #[cfg(feature = "spv-out")] - group.bench_function("spv", |b| { - b.iter(|| { - let mut data = Vec::new(); - let options = naga::back::spv::Options::default(); - for &(ref module, ref info) in inputs.iter() { - let mut writer = naga::back::spv::Writer::new(&options).unwrap(); - writer.write(module, info, None, &None, &mut data).unwrap(); - data.clear(); - } - }); - }); - #[cfg(feature = "spv-out")] - group.bench_function("spv-separate", |b| { - b.iter(|| { - let mut data = Vec::new(); - let options = naga::back::spv::Options::default(); - for &(ref module, ref info) in inputs.iter() { - let mut writer = naga::back::spv::Writer::new(&options).unwrap(); - for ep in module.entry_points.iter() { - let pipeline_options = naga::back::spv::PipelineOptions { - shader_stage: ep.stage, - entry_point: ep.name.clone(), - }; - writer - .write(module, info, Some(&pipeline_options), &None, &mut data) - .unwrap(); - data.clear(); - } - } - }); - }); - - #[cfg(feature = "msl-out")] - group.bench_function("msl", |b| { - b.iter(|| { - let mut string = String::new(); - let options = naga::back::msl::Options::default(); - for &(ref module, ref info) in inputs.iter() { - let pipeline_options = 
naga::back::msl::PipelineOptions::default(); - let mut writer = naga::back::msl::Writer::new(&mut string); - writer - .write(module, info, &options, &pipeline_options) - .unwrap(); - string.clear(); - } - }); - }); - - #[cfg(feature = "hlsl-out")] - group.bench_function("hlsl", |b| { - b.iter(|| { - let options = naga::back::hlsl::Options::default(); - let mut string = String::new(); - for &(ref module, ref info) in inputs.iter() { - let mut writer = naga::back::hlsl::Writer::new(&mut string, &options); - let _ = writer.write(module, info); // may fail on unimplemented things - string.clear(); - } - }); - }); - - #[cfg(feature = "glsl-out")] - group.bench_function("glsl-separate", |b| { - b.iter(|| { - let mut string = String::new(); - let options = naga::back::glsl::Options { - version: naga::back::glsl::Version::new_gles(320), - writer_flags: naga::back::glsl::WriterFlags::empty(), - binding_map: Default::default(), - zero_initialize_workgroup_memory: true, - }; - for &(ref module, ref info) in inputs.iter() { - for ep in module.entry_points.iter() { - let pipeline_options = naga::back::glsl::PipelineOptions { - shader_stage: ep.stage, - entry_point: ep.name.clone(), - multiview: None, - }; - - // might be `Err` if missing features - if let Ok(mut writer) = naga::back::glsl::Writer::new( - &mut string, - module, - info, - &options, - &pipeline_options, - naga::proc::BoundsCheckPolicies::default(), - ) { - let _ = writer.write(); // might be `Err` if unsupported - } - - string.clear(); - } - } - }); - }); -} - -criterion_group!(criterion, frontends, validation, backends,); -criterion_main!(criterion); diff --git a/naga/fuzz/Cargo.toml b/naga/fuzz/Cargo.toml index 3e46af0c5..196919e44 100644 --- a/naga/fuzz/Cargo.toml +++ b/naga/fuzz/Cargo.toml @@ -21,23 +21,27 @@ features = ["arbitrary", "spv-in", "wgsl-in", "glsl-in"] [[bin]] name = "spv_parser" path = "fuzz_targets/spv_parser.rs" +bench = false test = false doc = false [[bin]] name = "wgsl_parser" path = "fuzz_targets/wgsl_parser.rs" +bench = false test = false doc = false [[bin]] name = "glsl_parser" path = "fuzz_targets/glsl_parser.rs" +bench = false test = false doc = false [[bin]] name = "ir" path = "fuzz_targets/ir.rs" +bench = false test = false doc = false diff --git a/naga/src/back/hlsl/help.rs b/naga/src/back/hlsl/help.rs index d3bb1ce7f..e6b0b3d61 100644 --- a/naga/src/back/hlsl/help.rs +++ b/naga/src/back/hlsl/help.rs @@ -1044,7 +1044,12 @@ impl<'a, W: Write> super::Writer<'a, W> { crate::Expression::GlobalVariable(var_handle) => { &module.global_variables[var_handle] } - ref other => unreachable!("Array length of base {:?}", other), + ref other => { + return Err(super::Error::Unimplemented(format!( + "Array length of base {:?}", + other + ))) + } }; let storage_access = match global_var.space { crate::AddressSpace::Storage { access } => access, diff --git a/wgpu-core/src/command/memory_init.rs b/wgpu-core/src/command/memory_init.rs index 54bdedb79..338cdf8f2 100644 --- a/wgpu-core/src/command/memory_init.rs +++ b/wgpu-core/src/command/memory_init.rs @@ -172,6 +172,8 @@ impl BakedCommands { device_tracker: &mut Tracker, snatch_guard: &SnatchGuard<'_>, ) -> Result<(), DestroyedBufferError> { + profiling::scope!("initialize_buffer_memory"); + // Gather init ranges for each buffer so we can collapse them. // It is not possible to do this at an earlier point since previously // executed command buffer change the resource init state. 
@@ -276,6 +278,8 @@ impl BakedCommands {
         device: &Device,
         snatch_guard: &SnatchGuard<'_>,
     ) -> Result<(), DestroyedTextureError> {
+        profiling::scope!("initialize_texture_memory");
+
         let mut ranges: Vec<TextureInitRange> = Vec::new();
         for texture_use in self.texture_memory_actions.drain_init_actions() {
             let mut initialization_status = texture_use.texture.initialization_status.write();
diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs
index e9da11b7a..854ebfd76 100644
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -32,7 +32,9 @@ pub const SHADER_STAGE_COUNT: usize = hal::MAX_CONCURRENT_SHADER_STAGES;
 // value is enough for a 16k texture with float4 format.
 pub(crate) const ZERO_BUFFER_SIZE: BufferAddress = 512 << 10;
 
-const CLEANUP_WAIT_MS: u32 = 5000;
+// If a submission is not completed within this time, we go off into UB land.
+// See https://github.com/gfx-rs/wgpu/issues/4589. 60s to reduce the chances of this.
+const CLEANUP_WAIT_MS: u32 = 60000;
 
 const IMPLICIT_BIND_GROUP_LAYOUT_ERROR_LABEL: &str = "Implicit BindGroupLayout in the Error State";
 const ENTRYPOINT_FAILURE_ERROR: &str = "The given EntryPoint is Invalid";
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index f7beff894..168b36843 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -1186,6 +1186,8 @@ impl Global {
 
             // finish all the command buffers first
             for &cmb_id in command_buffer_ids {
+                profiling::scope!("process command buffer");
+
                 // we reset the used surface textures every time we use
                 // it, so make sure to set_size on it.
                 used_surface_textures.set_size(device.tracker_indices.textures.size());
@@ -1222,59 +1224,73 @@ impl Global {
                     continue;
                 }
 
-                // optimize the tracked states
-                // cmdbuf.trackers.optimize();
                 {
+                    profiling::scope!("update submission ids");
+
                     let cmd_buf_data = cmdbuf.data.lock();
                     let cmd_buf_trackers = &cmd_buf_data.as_ref().unwrap().trackers;
 
                     // update submission IDs
-                    for buffer in cmd_buf_trackers.buffers.used_resources() {
-                        if buffer.raw.get(&snatch_guard).is_none() {
-                            return Err(QueueSubmitError::DestroyedBuffer(
-                                buffer.info.id(),
-                            ));
-                        }
-                        buffer.info.use_at(submit_index);
-
-                        match *buffer.map_state.lock() {
-                            BufferMapState::Idle => (),
-                            _ => {
-                                return Err(QueueSubmitError::BufferStillMapped(
+                    {
+                        profiling::scope!("buffers");
+                        for buffer in cmd_buf_trackers.buffers.used_resources() {
+                            if buffer.raw.get(&snatch_guard).is_none() {
+                                return Err(QueueSubmitError::DestroyedBuffer(
                                     buffer.info.id(),
-                                ))
-                            }
-                        }
-                    }
-                    for texture in cmd_buf_trackers.textures.used_resources() {
-                        let should_extend = match texture.inner.get(&snatch_guard) {
-                            None => {
-                                return Err(QueueSubmitError::DestroyedTexture(
-                                    texture.info.id(),
                                 ));
                             }
-                            Some(TextureInner::Native { .. }) => false,
-                            Some(TextureInner::Surface { ref raw, .. }) => {
-                                if raw.is_some() {
-                                    submit_surface_textures_owned.push(texture.clone());
-                                }
+                            buffer.info.use_at(submit_index);
 
-                                true
+                            match *buffer.map_state.lock() {
+                                BufferMapState::Idle => (),
+                                _ => {
+                                    return Err(QueueSubmitError::BufferStillMapped(
+                                        buffer.info.id(),
+                                    ))
+                                }
                             }
-                        };
-                        texture.info.use_at(submit_index);
-                        if should_extend {
-                            unsafe {
-                                used_surface_textures
-                                    .merge_single(&texture, None, hal::TextureUses::PRESENT)
-                                    .unwrap();
-                            };
                         }
                     }
-                    for texture_view in cmd_buf_trackers.views.used_resources() {
-                        texture_view.info.use_at(submit_index);
+                    {
+                        profiling::scope!("textures");
+                        for texture in cmd_buf_trackers.textures.used_resources() {
+                            let should_extend = match texture.inner.get(&snatch_guard) {
+                                None => {
+                                    return Err(QueueSubmitError::DestroyedTexture(
+                                        texture.info.id(),
+                                    ));
+                                }
+                                Some(TextureInner::Native { .. }) => false,
+                                Some(TextureInner::Surface { ref raw, .. }) => {
+                                    if raw.is_some() {
+                                        submit_surface_textures_owned.push(texture.clone());
+                                    }
+
+                                    true
+                                }
+                            };
+                            texture.info.use_at(submit_index);
+                            if should_extend {
+                                unsafe {
+                                    used_surface_textures
+                                        .merge_single(
+                                            &texture,
+                                            None,
+                                            hal::TextureUses::PRESENT,
+                                        )
+                                        .unwrap();
+                                };
+                            }
+                        }
                     }
                     {
+                        profiling::scope!("views");
+                        for texture_view in cmd_buf_trackers.views.used_resources() {
+                            texture_view.info.use_at(submit_index);
+                        }
+                    }
+                    {
+                        profiling::scope!("bind groups (+ referenced views/samplers)");
                         for bg in cmd_buf_trackers.bind_groups.used_resources() {
                             bg.info.use_at(submit_index);
                             // We need to update the submission indices for the contained
@@ -1288,36 +1304,51 @@ impl Global {
                             }
                         }
                     }
-                    // assert!(cmd_buf_trackers.samplers.is_empty());
-                    for compute_pipeline in
-                        cmd_buf_trackers.compute_pipelines.used_resources()
                     {
-                        compute_pipeline.info.use_at(submit_index);
+                        profiling::scope!("compute pipelines");
+                        for compute_pipeline in
+                            cmd_buf_trackers.compute_pipelines.used_resources()
+                        {
+                            compute_pipeline.info.use_at(submit_index);
+                        }
                     }
-                    for render_pipeline in
-                        cmd_buf_trackers.render_pipelines.used_resources()
                     {
-                        render_pipeline.info.use_at(submit_index);
-                    }
-                    for query_set in cmd_buf_trackers.query_sets.used_resources() {
-                        query_set.info.use_at(submit_index);
-                    }
-                    for bundle in cmd_buf_trackers.bundles.used_resources() {
-                        bundle.info.use_at(submit_index);
-                        // We need to update the submission indices for the contained
-                        // state-less (!) resources as well, excluding the bind groups.
-                        // They don't get deleted too early if the bundle goes out of scope.
+                        profiling::scope!("render pipelines");
                         for render_pipeline in
-                            bundle.used.render_pipelines.read().used_resources()
+                            cmd_buf_trackers.render_pipelines.used_resources()
                         {
                             render_pipeline.info.use_at(submit_index);
                         }
-                        for query_set in bundle.used.query_sets.read().used_resources() {
+                    }
+                    {
+                        profiling::scope!("query sets");
+                        for query_set in cmd_buf_trackers.query_sets.used_resources() {
                             query_set.info.use_at(submit_index);
                         }
                     }
+                    {
+                        profiling::scope!(
+                            "render bundles (+ referenced pipelines/query sets)"
+                        );
+                        for bundle in cmd_buf_trackers.bundles.used_resources() {
+                            bundle.info.use_at(submit_index);
+                            // We need to update the submission indices for the contained
+                            // state-less (!) resources as well, excluding the bind groups.
+                            // They don't get deleted too early if the bundle goes out of scope.
+                            for render_pipeline in
+                                bundle.used.render_pipelines.read().used_resources()
+                            {
+                                render_pipeline.info.use_at(submit_index);
+                            }
+                            for query_set in bundle.used.query_sets.read().used_resources()
+                            {
+                                query_set.info.use_at(submit_index);
+                            }
+                        }
+                    }
                 }
 
                 let mut baked = cmdbuf.from_arc_into_baked();
+
                 // execute resource transitions
                 unsafe {
                     baked
@@ -1385,6 +1416,13 @@ impl Global {
                     raw: baked.encoder,
                     cmd_buffers: baked.list,
                 });
+
+                {
+                    // This involves actually decrementing the ref count of all command buffer
+                    // resources, so can be _very_ expensive.
+                    profiling::scope!("drop command buffer trackers");
+                    drop(baked.trackers);
+                }
             }
 
             log::trace!("Device after submission {}", submit_index);
diff --git a/wgpu/Cargo.toml b/wgpu/Cargo.toml
index 9d52f54d0..81927f0a6 100644
--- a/wgpu/Cargo.toml
+++ b/wgpu/Cargo.toml
@@ -84,9 +84,6 @@ naga-ir = ["dep:naga"]
 ## to the validation carried out at public APIs in all builds.
 strict_asserts = ["wgc?/strict_asserts", "wgt/strict_asserts"]
 
-## Log all API entry points at info instead of trace level.
-api_log_info = ["wgc/api_log_info"]
-
 ## Enables serialization via `serde` on common wgpu types.
 serde = ["dep:serde", "wgc/serde"]
 
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index 3f6eb622b..f173fe969 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -13,11 +13,21 @@ Usage: xtask <COMMAND>
 Commands:
   run-wasm
+    Build and run web examples
+
     --release    Build in release mode
     --no-serve   Just build the generated files, don't serve them
+
   test
+    Run tests
+
+    --llvm-cov   Run tests with LLVM code coverage using the llvm-cov tool
+    --list       List all of the tests and their executables without running them
+    --retries    Number of times to retry failing tests
+
   vendor-web-sys
+    Re-vendor the WebGPU web-sys bindings.
+
     --no-cleanup  Don't clean up temporary checkout of wasm-bindgen
     One of:
       --path-to-checkout  Path to a local checkout of wasm-bindgen to generate bindings from.
diff --git a/xtask/src/run_wasm.rs b/xtask/src/run_wasm.rs
index 33351e670..e575b0578 100644
--- a/xtask/src/run_wasm.rs
+++ b/xtask/src/run_wasm.rs
@@ -5,7 +5,7 @@ use xshell::Shell;
 
 use crate::util::{check_all_programs, Program};
 
-pub(crate) fn run_wasm(shell: Shell, mut args: Arguments) -> Result<(), anyhow::Error> {
+pub(crate) fn run_wasm(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
     let no_serve = args.contains("--no-serve");
     let release = args.contains("--release");
 
diff --git a/xtask/src/test.rs b/xtask/src/test.rs
index 70278df47..c5b378da1 100644
--- a/xtask/src/test.rs
+++ b/xtask/src/test.rs
@@ -4,6 +4,12 @@ use xshell::Shell;
 
 pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
     let llvm_cov = args.contains("--llvm-cov");
+    let list = args.contains("--list");
+    let retries = args
+        .opt_value_from_str("--retries")?
+        .unwrap_or(0_u32)
+        .to_string();
+
     // These needs to match the command in "run wgpu-info" in `.github/workflows/ci.yml`
     let llvm_cov_flags: &[_] = if llvm_cov {
         &["llvm-cov", "--no-cfg-coverage", "--no-report"]
     } else {
@@ -13,18 +19,30 @@ pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
     let llvm_cov_nextest_flags: &[_] = if llvm_cov {
         &["llvm-cov", "--no-cfg-coverage", "--no-report", "nextest"]
     } else {
-        &["nextest", "run"]
+        if list {
+            &["nextest", "list"]
+        } else {
+            &["nextest", "run"]
+        }
     };
 
     log::info!("Generating .gpuconfig file based on gpus on the system");
-    xshell::cmd!(
-        shell,
-        "cargo {llvm_cov_flags...} run --bin wgpu-info -- --json -o .gpuconfig"
-    )
-    .quiet()
-    .run()
-    .context("Failed to run wgpu-info to generate .gpuconfig")?;
+    shell
+        .cmd("cargo")
+        .args(llvm_cov_flags)
+        .args([
+            "run",
+            "--bin",
+            "wgpu-info",
+            "--",
+            "--json",
+            "-o",
+            ".gpuconfig",
+        ])
+        .quiet()
+        .run()
+        .context("Failed to run wgpu-info to generate .gpuconfig")?;
 
     let gpu_count = shell
         .read_file(".gpuconfig")
@@ -39,16 +57,34 @@ pub fn run_tests(shell: Shell, mut args: Arguments) -> anyhow::Result<()> {
         if gpu_count == 1 { "" } else { "s" }
     );
 
+    if list {
+        log::info!("Listing tests");
+        shell
+            .cmd("cargo")
+            .args(llvm_cov_nextest_flags)
+            .args(["-v", "--benches", "--tests", "--all-features"])
+            .args(args.finish())
+            .run()
+            .context("Failed to list tests")?;
+
+        return Ok(());
+    }
+
     log::info!("Running cargo tests");
-    xshell::cmd!(
-        shell,
-        "cargo {llvm_cov_nextest_flags...} --all-features --no-fail-fast --retries 2"
-    )
-    .args(args.finish())
-    .quiet()
-    .run()
-    .context("Tests failed")?;
+    shell
+        .cmd("cargo")
+        .args(llvm_cov_nextest_flags)
+        .args([
+            "--benches",
+            "--tests",
+            "--no-fail-fast",
+            "--all-features",
+            "--retries",
+            &retries,
+        ])
+        .args(args.finish())
+        .quiet()
+        .run()
+        .context("Tests failed")?;
 
     log::info!("Finished tests");
 
diff --git a/xtask/src/util.rs b/xtask/src/util.rs
index 85f4444c4..186426971 100644
--- a/xtask/src/util.rs
+++ b/xtask/src/util.rs
@@ -1,15 +1,15 @@
 use std::{io, process::Command};
 
 pub(crate) struct Program {
-    pub binary_name: &'static str,
     pub crate_name: &'static str,
+    pub binary_name: &'static str,
 }
 
 pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
-    let mut failed = Vec::new();
-    for Program {
-        binary_name,
+    let mut failed_crates = Vec::new();
+    for &Program {
         crate_name,
+        binary_name,
     } in programs
     {
         let mut cmd = Command::new(binary_name);
@@ -21,7 +21,7 @@ pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
             }
             Err(e) if matches!(e.kind(), io::ErrorKind::NotFound) => {
                 log::error!("Checking for {binary_name} in PATH: ❌");
-                failed.push(*crate_name);
+                failed_crates.push(crate_name);
             }
             Err(e) => {
                 log::error!("Checking for {binary_name} in PATH: ❌");
@@ -30,12 +30,13 @@ pub(crate) fn check_all_programs(programs: &[Program]) -> anyhow::Result<()> {
         }
     }
 
-    if !failed.is_empty() {
+    if !failed_crates.is_empty() {
         log::error!(
             "Please install them with: cargo install {}",
-            failed.join(" ")
+            failed_crates.join(" ")
        );
-        anyhow::bail!("Missing programs in PATH");
+
+        anyhow::bail!("Missing required programs");
     }
 
     Ok(())
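
A minimal sketch (not part of the patch above) of how the reworked `xtask test` command assembles its cargo/nextest invocation from the new `--list`, `--llvm-cov`, and `--retries` flags. The helper name `nextest_args` is hypothetical; the flag values mirror the changed `xtask/src/test.rs` shown above.

    // Hypothetical helper (not in the patch): mirrors how `run_tests` builds the
    // argument list it passes to `shell.cmd("cargo")` in xtask/src/test.rs.
    fn nextest_args(list: bool, llvm_cov: bool, retries: u32) -> Vec<String> {
        let mut args = Vec::new();
        if llvm_cov {
            // Coverage runs are wrapped by cargo-llvm-cov before reaching nextest.
            args.extend(["llvm-cov", "--no-cfg-coverage", "--no-report"].map(String::from));
        }
        if list {
            // `xtask test --list` only enumerates tests and benches.
            args.extend(
                ["nextest", "list", "-v", "--benches", "--tests", "--all-features"]
                    .map(String::from),
            );
        } else {
            args.extend(
                ["nextest", "run", "--benches", "--tests", "--no-fail-fast", "--all-features"]
                    .map(String::from),
            );
            args.push("--retries".to_string());
            args.push(retries.to_string());
        }
        args
    }

For example, `nextest_args(false, false, 0)` yields roughly the default `cargo nextest run --benches --tests --no-fail-fast --all-features --retries 0` invocation that `run_tests` issues when no extra flags are given.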