Compute pass benchmark (#5767)

Adds a benchmark for compute pass recording, very similar to what we have for render passes.
This commit is contained in:
Andreas Reich 2024-07-14 22:13:50 +02:00 committed by GitHub
parent 586215ab2e
commit d3edbc57a9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 652 additions and 3 deletions

View File

@ -183,6 +183,7 @@ By @teoxoy in [#5901](https://github.com/gfx-rs/wgpu/pull/5901)
- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
- `wgpu::Error` is now `Sync`, making it possible to be wrapped in `anyhow::Error` or `eyre::Report`. By @nolanderc in [#5820](https://github.com/gfx-rs/wgpu/pull/5820)
- Added benchmark suite. By @cwfitzgerald in [#5694](https://github.com/gfx-rs/wgpu/pull/5694), compute passes by @wumpf in [#5767](https://github.com/gfx-rs/wgpu/pull/5767)
#### Metal
- Removed the `link` Cargo feature.

View File

@ -43,4 +43,4 @@ pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true
wgpu = { workspace = true, features = ["wgsl"] }

View File

@ -24,6 +24,21 @@ By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
#### `Resource Creation`

View File

@ -0,0 +1,26 @@
// Bindless variant of the computepass benchmark shader: instead of one bind
// group per dispatch, all resources are bound at once as binding arrays and
// indexed per invocation.
@group(0) @binding(0)
var tex: binding_array<texture_2d<f32>>;

@group(0) @binding(1)
// TODO(https://github.com/gfx-rs/wgpu/issues/5765): The extra whitespace between the angle brackets is needed to workaround a parsing bug.
var images: binding_array<texture_storage_2d<r32float, read_write> >;

struct BufferElement {
    element: vec4f,
}

@group(0) @binding(2)
var<storage, read_write> buffers: binding_array<BufferElement>;

@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    let offset = global_invocation_id.x; // Would be nice to offset this dynamically (it's just 0 always in the current setup)

    // Two consecutive array slots per invocation, mirroring the two resources
    // of each kind that the non-bindless benchmark binds per dispatch.
    let idx0 = offset * 2 + 0;
    let idx1 = offset * 2 + 1;

    // Fix: the second sampled-texture load previously reused `tex[idx0]`, so
    // `tex[idx1]` was never read — asymmetric with the storage-texture loads below.
    let tex = textureLoad(tex[idx0], vec2u(0), 0) + textureLoad(tex[idx1], vec2u(0), 0);
    let image = textureLoad(images[idx0], vec2u(0)) + textureLoad(images[idx1], vec2u(0));
    buffers[idx0].element = tex.rrrr;
    buffers[idx1].element = image.rrrr;
}

View File

@ -0,0 +1,574 @@
use std::{
num::{NonZeroU32, NonZeroU64},
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use once_cell::sync::Lazy;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
// Number of dispatches recorded by the non-bindless benchmarks.
#[cfg(not(test))]
const DISPATCH_COUNT: usize = 10_000;
#[cfg(test)]
const DISPATCH_COUNT: usize = 8; // Running with up to 8 threads.
// Currently bindless is _much_ slower than with regular resources,
// since wgpu needs to issue barriers for all resources between each dispatch for all read/write textures & buffers.
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
#[cfg(not(test))]
const DISPATCH_COUNT_BINDLESS: usize = 1_000;
#[cfg(test)]
const DISPATCH_COUNT_BINDLESS: usize = 8; // Running with up to 8 threads.
// Must match the number of textures in the computepass.wgsl shader
const TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_BUFFERS_PER_DISPATCH: usize = 2;
// Total resource counts across all (non-bindless) dispatches.
const TEXTURE_COUNT: usize = DISPATCH_COUNT * TEXTURES_PER_DISPATCH;
const STORAGE_TEXTURE_COUNT: usize = DISPATCH_COUNT * STORAGE_TEXTURES_PER_DISPATCH;
const STORAGE_BUFFER_COUNT: usize = DISPATCH_COUNT * STORAGE_BUFFERS_PER_DISPATCH;
// Size in bytes of each storage buffer; matches one `vec4f` (`BufferElement`)
// in the computepass shaders.
const BUFFER_SIZE: u64 = 16;
/// All GPU state shared by the computepass benchmarks, created once up front
/// so that only pass recording/submission is measured.
struct ComputepassState {
    device_state: DeviceState,
    pipeline: wgpu::ComputePipeline,
    // One bind group per dispatch of the non-bindless benchmark.
    bind_groups: Vec<wgpu::BindGroup>,
    // Bindless resources
    // `None` when the adapter lacks the required binding-array features
    // (checked in `ComputepassState::new`).
    bindless_bind_group: Option<wgpu::BindGroup>,
    bindless_pipeline: Option<wgpu::ComputePipeline>,
}
impl ComputepassState {
/// Create and prepare all the resources needed for the computepass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();
let supports_bindless = device_state.device.features().contains(
wgpu::Features::BUFFER_BINDING_ARRAY
| wgpu::Features::TEXTURE_BINDING_ARRAY
| wgpu::Features::STORAGE_RESOURCE_BINDING_ARRAY
| wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
)
// TODO: as of writing llvmpipe segfaults the bindless benchmark on ci
&& device_state.adapter_info.driver != "llvmpipe";
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
for i in 0..TEXTURES_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
});
}
for i in 0..STORAGE_TEXTURES_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: (TEXTURES_PER_DISPATCH + i) as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::ReadWrite,
format: wgpu::TextureFormat::R32Float,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: None,
});
}
for i in 0..STORAGE_BUFFERS_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + i) as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: NonZeroU64::new(BUFFER_SIZE),
},
count: None,
});
}
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &bind_group_layout_entries,
});
let mut texture_views = Vec::with_capacity(TEXTURE_COUNT);
for i in 0..TEXTURE_COUNT {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);
let texture_view_refs: Vec<_> = texture_views.iter().collect();
let mut storage_texture_views = Vec::with_capacity(STORAGE_TEXTURE_COUNT);
for i in 0..TEXTURE_COUNT {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("StorageTexture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::R32Float,
usage: wgpu::TextureUsages::STORAGE_BINDING,
view_formats: &[],
});
storage_texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("StorageTexture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut storage_texture_views);
let storage_texture_view_refs: Vec<_> = storage_texture_views.iter().collect();
let mut storage_buffers = Vec::with_capacity(STORAGE_BUFFER_COUNT);
for i in 0..STORAGE_BUFFER_COUNT {
storage_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: Some(&format!("Buffer {i}")),
size: BUFFER_SIZE,
usage: wgpu::BufferUsages::STORAGE,
mapped_at_creation: false,
}));
}
random.shuffle(&mut storage_buffers);
let storage_buffer_bindings: Vec<_> = storage_buffers
.iter()
.map(|b| b.as_entire_buffer_binding())
.collect();
let mut bind_groups = Vec::with_capacity(DISPATCH_COUNT);
for dispatch_idx in 0..DISPATCH_COUNT {
let mut entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
for tex_idx in 0..TEXTURES_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
resource: wgpu::BindingResource::TextureView(
&texture_views[dispatch_idx * TEXTURES_PER_DISPATCH + tex_idx],
),
});
}
for tex_idx in 0..STORAGE_TEXTURES_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: (TEXTURES_PER_DISPATCH + tex_idx) as u32,
resource: wgpu::BindingResource::TextureView(
&storage_texture_views
[dispatch_idx * STORAGE_TEXTURES_PER_DISPATCH + tex_idx],
),
});
}
for buffer_idx in 0..STORAGE_BUFFERS_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + buffer_idx)
as u32,
resource: wgpu::BindingResource::Buffer(
storage_buffers[dispatch_idx * STORAGE_BUFFERS_PER_DISPATCH + buffer_idx]
.as_entire_buffer_binding(),
),
});
}
bind_groups.push(
device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &entries,
}),
);
}
random.shuffle(&mut bind_groups);
let sm = device_state
.device
.create_shader_module(wgpu::include_wgsl!("computepass.wgsl"));
let pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
let pipeline =
device_state
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("Compute Pipeline"),
layout: Some(&pipeline_layout),
module: &sm,
entry_point: "cs_main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
let (bindless_bind_group, bindless_pipeline) = if supports_bindless {
let bindless_bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[
wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float {
filterable: true,
},
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()),
},
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::ReadWrite,
format: wgpu::TextureFormat::R32Float,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: Some(NonZeroU32::new(STORAGE_TEXTURE_COUNT as u32).unwrap()),
},
wgpu::BindGroupLayoutEntry {
binding: 2,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: std::num::NonZeroU64::new(BUFFER_SIZE),
},
count: Some(NonZeroU32::new(STORAGE_BUFFER_COUNT as u32).unwrap()),
},
],
});
let bindless_bind_group =
device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bindless_bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(
&texture_view_refs[..DISPATCH_COUNT_BINDLESS],
),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::TextureViewArray(
&storage_texture_view_refs[..DISPATCH_COUNT_BINDLESS],
),
},
wgpu::BindGroupEntry {
binding: 2,
resource: wgpu::BindingResource::BufferArray(
&storage_buffer_bindings[..DISPATCH_COUNT_BINDLESS],
),
},
],
});
let bindless_sm = device_state
.device
.create_shader_module(wgpu::include_wgsl!("computepass-bindless.wgsl"));
let bindless_pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bindless_bind_group_layout],
push_constant_ranges: &[],
});
let bindless_pipeline =
device_state
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("Compute Pipeline bindless"),
layout: Some(&bindless_pipeline_layout),
module: &bindless_sm,
entry_point: "cs_main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
(Some(bindless_bind_group), Some(bindless_pipeline))
} else {
(None, None)
};
Self {
device_state,
pipeline,
bind_groups,
bindless_bind_group,
bindless_pipeline,
}
}
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
let dispatch_per_pass = DISPATCH_COUNT / total_passes;
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: None,
timestamp_writes: None,
});
let start_idx = pass_number * dispatch_per_pass;
let end_idx = start_idx + dispatch_per_pass;
for dispatch_idx in start_idx..end_idx {
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &self.bind_groups[dispatch_idx], &[]);
compute_pass.dispatch_workgroups(1, 1, 1);
}
drop(compute_pass);
encoder.finish()
}
fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Computepass");
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: None,
timestamp_writes: None,
});
compute_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
compute_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
for _ in 0..DISPATCH_COUNT_BINDLESS {
compute_pass.dispatch_workgroups(1, 1, 1);
}
drop(compute_pass);
encoder.finish()
}
}
/// Criterion entry point for the compute pass benchmarks.
///
/// Benchmark groups:
/// 1. single-threaded recording, with the dispatches split over 1/2/4/8 passes
///    (timing either recording or submission),
/// 2. multi-threaded recording over 2/4/8 threads,
/// 3. the bindless variant (one pass, one big bind group), when supported,
/// 4. an empty submit, to measure per-submit overhead with many live resources.
fn run_bench(ctx: &mut Criterion) {
    // Lazy so that device/resource creation happens once, on first use.
    let state = Lazy::new(ComputepassState::new);
    // Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
    let mut group = ctx.benchmark_group("Computepass: Single Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
    for time_submit in [false, true] {
        for cpasses in [1, 2, 4, 8] {
            let dispatch_per_pass = DISPATCH_COUNT / cpasses;
            let label = if time_submit {
                "Submit Time"
            } else {
                "Computepass Time"
            };
            group.bench_function(
                &format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
                |b| {
                    Lazy::force(&state);
                    b.iter_custom(|iters| {
                        profiling::scope!("benchmark invocation");
                        let mut duration = Duration::ZERO;
                        for _ in 0..iters {
                            profiling::scope!("benchmark iteration");
                            // Depending on the mode, either recording or
                            // submission is timed — never both.
                            let mut start = Instant::now();
                            let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
                            for i in 0..cpasses {
                                buffers.push(state.run_subpass(i, cpasses));
                            }
                            if time_submit {
                                start = Instant::now();
                            } else {
                                duration += start.elapsed();
                            }
                            state.device_state.queue.submit(buffers);
                            if time_submit {
                                duration += start.elapsed();
                            }
                            // Wait for the GPU so work can't pile up across iterations.
                            state.device_state.device.poll(wgpu::Maintain::Wait);
                        }
                        duration
                    })
                },
            );
        }
    }
    group.finish();
    // Test 10k dispatch calls split up over 2, 4, and 8 threads.
    let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
    for threads in [2, 4, 8] {
        let dispatch_per_pass = DISPATCH_COUNT / threads;
        group.bench_function(
            &format!("{threads} threads x {dispatch_per_pass} dispatch"),
            |b| {
                Lazy::force(&state);
                b.iter_custom(|iters| {
                    profiling::scope!("benchmark invocation");
                    // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
                    if state.device_state.adapter_info.name.contains("Paravirtual") {
                        return Duration::from_secs_f32(1.0);
                    }
                    let mut duration = Duration::ZERO;
                    for _ in 0..iters {
                        profiling::scope!("benchmark iteration");
                        // Only parallel pass recording is timed; submit/poll are excluded.
                        let start = Instant::now();
                        let buffers = (0..threads)
                            .into_par_iter()
                            .map(|i| state.run_subpass(i, threads))
                            .collect::<Vec<_>>();
                        duration += start.elapsed();
                        state.device_state.queue.submit(buffers);
                        state.device_state.device.poll(wgpu::Maintain::Wait);
                    }
                    duration
                })
            },
        );
    }
    group.finish();
    // Test the bindless variant: a single pass with DISPATCH_COUNT_BINDLESS dispatches.
    let mut group = ctx.benchmark_group("Computepass: Bindless");
    group.throughput(Throughput::Elements(DISPATCH_COUNT_BINDLESS as _));
    group.bench_function(&format!("{DISPATCH_COUNT_BINDLESS} dispatch"), |b| {
        Lazy::force(&state);
        b.iter_custom(|iters| {
            profiling::scope!("benchmark invocation");
            // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
            if state.device_state.adapter_info.name.contains("Paravirtual") {
                return Duration::from_secs_f32(1.0);
            }
            // Need bindless to run this benchmark
            if state.bindless_bind_group.is_none() {
                return Duration::from_secs_f32(1.0);
            }
            let mut duration = Duration::ZERO;
            for _ in 0..iters {
                profiling::scope!("benchmark iteration");
                let start = Instant::now();
                let buffer = state.run_bindless_pass();
                duration += start.elapsed();
                state.device_state.queue.submit([buffer]);
                state.device_state.device.poll(wgpu::Maintain::Wait);
            }
            duration
        })
    });
    group.finish();
    // Measure the fixed cost of an empty submit while many resources are alive.
    ctx.bench_function(
        &format!(
            "Computepass: Empty Submit with {} Resources",
            TEXTURE_COUNT + STORAGE_TEXTURE_COUNT + STORAGE_BUFFER_COUNT
        ),
        |b| {
            Lazy::force(&state);
            b.iter(|| state.device_state.queue.submit([]));
        },
    );
}
// Register the benchmark group; 10s measurement time since each iteration
// records & submits thousands of dispatches.
criterion_group! {
    name = computepass;
    config = Criterion::default().measurement_time(Duration::from_secs(10));
    targets = run_bench,
}

View File

@ -0,0 +1,26 @@
// Non-bindless computepass benchmark shader: every dispatch is given its own
// bind group holding 2 sampled textures, 2 storage textures and 2 storage
// buffers (must match TEXTURES_PER_DISPATCH etc. in computepass.rs).
@group(0) @binding(0)
var tex_0: texture_2d<f32>;
@group(0) @binding(1)
var tex_1: texture_2d<f32>;
@group(0) @binding(2)
var image_0: texture_storage_2d<r32float, read_write>;
@group(0) @binding(3)
var image_1: texture_storage_2d<r32float, read_write>;
@group(0) @binding(4)
var<storage, read_write> buffer0 : array<vec4f>;
@group(0) @binding(5)
var<storage, read_write> buffer1 : array<vec4f>;
@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    // Touch every bound resource once so that wgpu has to track/barrier all of them.
    let tex = textureLoad(tex_0, vec2u(0), 0) + textureLoad(tex_1, vec2u(0), 0);
    let image = textureLoad(image_0, vec2u(0)) + textureLoad(image_1, vec2u(0));
    buffer0[0] = tex.rrrr;
    buffer1[0] = image.rrrr;
}

View File

@ -10,7 +10,12 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
// Number of draw calls recorded by the renderpass benchmarks.
#[cfg(test)]
const DRAW_COUNT: usize = 8; // Running with up to 8 threads.
#[cfg(not(test))]
const DRAW_COUNT: usize = 10_000;
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;

View File

@ -1,6 +1,7 @@
use criterion::criterion_main;
use pollster::block_on;
mod computepass;
mod renderpass;
mod resource_creation;
mod shader;
@ -45,7 +46,7 @@ impl DeviceState {
required_features: adapter.features(),
required_limits: adapter.limits(),
memory_hints: wgpu::MemoryHints::Performance,
label: Some("RenderPass Device"),
label: Some("Compute/RenderPass Device"),
},
None,
))
@ -61,6 +62,7 @@ impl DeviceState {
// All benchmark groups executed by `cargo bench`.
criterion_main!(
    renderpass::renderpass,
    computepass::computepass,
    resource_creation::resource_creation,
    shader::shader
);

View File

@ -66,7 +66,7 @@ pub enum CreateBindGroupLayoutError {
},
#[error(transparent)]
TooManyBindings(BindingTypeMaxCountError),
#[error("Binding index {binding} is greater than the maximum index {maximum}")]
#[error("Binding index {binding} is greater than the maximum number {maximum}")]
InvalidBindingIndex { binding: u32, maximum: u32 },
#[error("Invalid visibility {0:?}")]
InvalidVisibility(wgt::ShaderStages),