Compute pass benchmark (#5767)

Adds a benchmark for compute pass recording, very similar to what we have for render passes.
This commit is contained in:
Andreas Reich 2024-07-14 22:13:50 +02:00 committed by GitHub
parent 586215ab2e
commit d3edbc57a9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 652 additions and 3 deletions

View File

@ -183,6 +183,7 @@ By @teoxoy in [#5901](https://github.com/gfx-rs/wgpu/pull/5901)
- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
- `wgpu::Error` is now `Sync`, making it possible to be wrapped in `anyhow::Error` or `eyre::Report`. By @nolanderc in [#5820](https://github.com/gfx-rs/wgpu/pull/5820)
- Added benchmark suite. By @cwfitzgerald in [#5694](https://github.com/gfx-rs/wgpu/pull/5694), compute passes by @wumpf in [#5767](https://github.com/gfx-rs/wgpu/pull/5767)
#### Metal
- Removed the `link` Cargo feature.

View File

@ -43,4 +43,4 @@ pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true
wgpu = { workspace = true, features = ["wgsl"] }

View File

@ -24,6 +24,21 @@ By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
#### `Resource Creation`

View File

@ -0,0 +1,26 @@
// Bindless variant of the computepass benchmark shader: instead of one bind
// group per dispatch, all resources are bound at once as binding arrays and
// indexed per invocation.
@group(0) @binding(0)
var tex: binding_array<texture_2d<f32>>;

@group(0) @binding(1)
// TODO(https://github.com/gfx-rs/wgpu/issues/5765): The extra whitespace between the angle brackets is needed to workaround a parsing bug.
var images: binding_array<texture_storage_2d<r32float, read_write> >;

struct BufferElement {
    element: vec4f,
}

@group(0) @binding(2)
var<storage, read_write> buffers: binding_array<BufferElement>;

@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    let offset = global_invocation_id.x; // Would be nice to offset this dynamically (it's just 0 always in the current setup)

    // Two consecutive array slots per invocation, mirroring the two resources
    // of each kind that the non-bindless benchmark binds per dispatch.
    let idx0 = offset * 2 + 0;
    let idx1 = offset * 2 + 1;

    // Fix: the second sampled-texture load previously reused `tex[idx0]`, so
    // `tex[idx1]` was never read — asymmetric with the storage-texture loads below.
    let tex = textureLoad(tex[idx0], vec2u(0), 0) + textureLoad(tex[idx1], vec2u(0), 0);
    let image = textureLoad(images[idx0], vec2u(0)) + textureLoad(images[idx1], vec2u(0));
    buffers[idx0].element = tex.rrrr;
    buffers[idx1].element = image.rrrr;
}

View File

@ -0,0 +1,574 @@
use std::{
num::{NonZeroU32, NonZeroU64},
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use once_cell::sync::Lazy;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
// Number of dispatches recorded by the non-bindless benchmarks.
#[cfg(not(test))]
const DISPATCH_COUNT: usize = 10_000;
#[cfg(test)]
const DISPATCH_COUNT: usize = 8; // Running with up to 8 threads.
// Currently bindless is _much_ slower than with regular resources,
// since wgpu needs to issue barriers for all resources between each dispatch for all read/write textures & buffers.
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
#[cfg(not(test))]
const DISPATCH_COUNT_BINDLESS: usize = 1_000;
#[cfg(test)]
const DISPATCH_COUNT_BINDLESS: usize = 8; // Running with up to 8 threads.
// Must match the number of textures in the computepass.wgsl shader
const TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_BUFFERS_PER_DISPATCH: usize = 2;
// Total resource counts across all (non-bindless) dispatches.
const TEXTURE_COUNT: usize = DISPATCH_COUNT * TEXTURES_PER_DISPATCH;
const STORAGE_TEXTURE_COUNT: usize = DISPATCH_COUNT * STORAGE_TEXTURES_PER_DISPATCH;
const STORAGE_BUFFER_COUNT: usize = DISPATCH_COUNT * STORAGE_BUFFERS_PER_DISPATCH;
// Size in bytes of each storage buffer; matches one `vec4f` (`BufferElement`)
// in the computepass shaders.
const BUFFER_SIZE: u64 = 16;
/// All GPU state shared by the computepass benchmarks, created once up front
/// so that only pass recording/submission is measured.
struct ComputepassState {
    device_state: DeviceState,
    pipeline: wgpu::ComputePipeline,
    // One bind group per dispatch of the non-bindless benchmark.
    bind_groups: Vec<wgpu::BindGroup>,
    // Bindless resources
    // `None` when the adapter lacks the required binding-array features
    // (checked in `ComputepassState::new`).
    bindless_bind_group: Option<wgpu::BindGroup>,
    bindless_pipeline: Option<wgpu::ComputePipeline>,
}
impl ComputepassState {
/// Create and prepare all the resources needed for the computepass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();
let supports_bindless = device_state.device.features().contains(
wgpu::Features::BUFFER_BINDING_ARRAY
| wgpu::Features::TEXTURE_BINDING_ARRAY
| wgpu::Features::STORAGE_RESOURCE_BINDING_ARRAY
| wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
)
// TODO: as of writing llvmpipe segfaults the bindless benchmark on ci
&& device_state.adapter_info.driver != "llvmpipe";
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
for i in 0..TEXTURES_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
});
}
for i in 0..STORAGE_TEXTURES_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: (TEXTURES_PER_DISPATCH + i) as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::ReadWrite,
format: wgpu::TextureFormat::R32Float,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: None,
});
}
for i in 0..STORAGE_BUFFERS_PER_DISPATCH {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + i) as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: NonZeroU64::new(BUFFER_SIZE),
},
count: None,
});
}
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &bind_group_layout_entries,
});
let mut texture_views = Vec::with_capacity(TEXTURE_COUNT);
for i in 0..TEXTURE_COUNT {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);
let texture_view_refs: Vec<_> = texture_views.iter().collect();
let mut storage_texture_views = Vec::with_capacity(STORAGE_TEXTURE_COUNT);
for i in 0..TEXTURE_COUNT {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("StorageTexture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::R32Float,
usage: wgpu::TextureUsages::STORAGE_BINDING,
view_formats: &[],
});
storage_texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("StorageTexture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut storage_texture_views);
let storage_texture_view_refs: Vec<_> = storage_texture_views.iter().collect();
let mut storage_buffers = Vec::with_capacity(STORAGE_BUFFER_COUNT);
for i in 0..STORAGE_BUFFER_COUNT {
storage_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: Some(&format!("Buffer {i}")),
size: BUFFER_SIZE,
usage: wgpu::BufferUsages::STORAGE,
mapped_at_creation: false,
}));
}
random.shuffle(&mut storage_buffers);
let storage_buffer_bindings: Vec<_> = storage_buffers
.iter()
.map(|b| b.as_entire_buffer_binding())
.collect();
let mut bind_groups = Vec::with_capacity(DISPATCH_COUNT);
for dispatch_idx in 0..DISPATCH_COUNT {
let mut entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
for tex_idx in 0..TEXTURES_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
resource: wgpu::BindingResource::TextureView(
&texture_views[dispatch_idx * TEXTURES_PER_DISPATCH + tex_idx],
),
});
}
for tex_idx in 0..STORAGE_TEXTURES_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: (TEXTURES_PER_DISPATCH + tex_idx) as u32,
resource: wgpu::BindingResource::TextureView(
&storage_texture_views
[dispatch_idx * STORAGE_TEXTURES_PER_DISPATCH + tex_idx],
),
});
}
for buffer_idx in 0..STORAGE_BUFFERS_PER_DISPATCH {
entries.push(wgpu::BindGroupEntry {
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + buffer_idx)
as u32,
resource: wgpu::BindingResource::Buffer(
storage_buffers[dispatch_idx * STORAGE_BUFFERS_PER_DISPATCH + buffer_idx]
.as_entire_buffer_binding(),
),
});
}
bind_groups.push(
device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &entries,
}),
);
}
random.shuffle(&mut bind_groups);
let sm = device_state
.device
.create_shader_module(wgpu::include_wgsl!("computepass.wgsl"));
let pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
let pipeline =
device_state
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("Compute Pipeline"),
layout: Some(&pipeline_layout),
module: &sm,
entry_point: "cs_main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
let (bindless_bind_group, bindless_pipeline) = if supports_bindless {
let bindless_bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[
wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float {
filterable: true,
},
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()),
},
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::ReadWrite,
format: wgpu::TextureFormat::R32Float,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: Some(NonZeroU32::new(STORAGE_TEXTURE_COUNT as u32).unwrap()),
},
wgpu::BindGroupLayoutEntry {
binding: 2,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: std::num::NonZeroU64::new(BUFFER_SIZE),
},
count: Some(NonZeroU32::new(STORAGE_BUFFER_COUNT as u32).unwrap()),
},
],
});
let bindless_bind_group =
device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bindless_bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(
&texture_view_refs[..DISPATCH_COUNT_BINDLESS],
),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::TextureViewArray(
&storage_texture_view_refs[..DISPATCH_COUNT_BINDLESS],
),
},
wgpu::BindGroupEntry {
binding: 2,
resource: wgpu::BindingResource::BufferArray(
&storage_buffer_bindings[..DISPATCH_COUNT_BINDLESS],
),
},
],
});
let bindless_sm = device_state
.device
.create_shader_module(wgpu::include_wgsl!("computepass-bindless.wgsl"));
let bindless_pipeline_layout =
device_state
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bindless_bind_group_layout],
push_constant_ranges: &[],
});
let bindless_pipeline =
device_state
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("Compute Pipeline bindless"),
layout: Some(&bindless_pipeline_layout),
module: &bindless_sm,
entry_point: "cs_main",
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
(Some(bindless_bind_group), Some(bindless_pipeline))
} else {
(None, None)
};
Self {
device_state,
pipeline,
bind_groups,
bindless_bind_group,
bindless_pipeline,
}
}
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
let dispatch_per_pass = DISPATCH_COUNT / total_passes;
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: None,
timestamp_writes: None,
});
let start_idx = pass_number * dispatch_per_pass;
let end_idx = start_idx + dispatch_per_pass;
for dispatch_idx in start_idx..end_idx {
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &self.bind_groups[dispatch_idx], &[]);
compute_pass.dispatch_workgroups(1, 1, 1);
}
drop(compute_pass);
encoder.finish()
}
fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Computepass");
let mut encoder = self
.device_state
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: None,
timestamp_writes: None,
});
compute_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
compute_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
for _ in 0..DISPATCH_COUNT_BINDLESS {
compute_pass.dispatch_workgroups(1, 1, 1);
}
drop(compute_pass);
encoder.finish()
}
}
/// Criterion entry point for the compute pass benchmarks.
///
/// Benchmark groups:
/// 1. single-threaded recording, with the dispatches split over 1/2/4/8 passes
///    (timing either recording or submission),
/// 2. multi-threaded recording over 2/4/8 threads,
/// 3. the bindless variant (one pass, one big bind group), when supported,
/// 4. an empty submit, to measure per-submit overhead with many live resources.
fn run_bench(ctx: &mut Criterion) {
    // Lazy so that device/resource creation happens once, on first use.
    let state = Lazy::new(ComputepassState::new);
    // Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
    let mut group = ctx.benchmark_group("Computepass: Single Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
    for time_submit in [false, true] {
        for cpasses in [1, 2, 4, 8] {
            let dispatch_per_pass = DISPATCH_COUNT / cpasses;
            let label = if time_submit {
                "Submit Time"
            } else {
                "Computepass Time"
            };
            group.bench_function(
                &format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
                |b| {
                    Lazy::force(&state);
                    b.iter_custom(|iters| {
                        profiling::scope!("benchmark invocation");
                        let mut duration = Duration::ZERO;
                        for _ in 0..iters {
                            profiling::scope!("benchmark iteration");
                            // Depending on the mode, either recording or
                            // submission is timed — never both.
                            let mut start = Instant::now();
                            let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
                            for i in 0..cpasses {
                                buffers.push(state.run_subpass(i, cpasses));
                            }
                            if time_submit {
                                start = Instant::now();
                            } else {
                                duration += start.elapsed();
                            }
                            state.device_state.queue.submit(buffers);
                            if time_submit {
                                duration += start.elapsed();
                            }
                            // Wait for the GPU so work can't pile up across iterations.
                            state.device_state.device.poll(wgpu::Maintain::Wait);
                        }
                        duration
                    })
                },
            );
        }
    }
    group.finish();
    // Test 10k dispatch calls split up over 2, 4, and 8 threads.
    let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
    for threads in [2, 4, 8] {
        let dispatch_per_pass = DISPATCH_COUNT / threads;
        group.bench_function(
            &format!("{threads} threads x {dispatch_per_pass} dispatch"),
            |b| {
                Lazy::force(&state);
                b.iter_custom(|iters| {
                    profiling::scope!("benchmark invocation");
                    // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
                    if state.device_state.adapter_info.name.contains("Paravirtual") {
                        return Duration::from_secs_f32(1.0);
                    }
                    let mut duration = Duration::ZERO;
                    for _ in 0..iters {
                        profiling::scope!("benchmark iteration");
                        // Only parallel pass recording is timed; submit/poll are excluded.
                        let start = Instant::now();
                        let buffers = (0..threads)
                            .into_par_iter()
                            .map(|i| state.run_subpass(i, threads))
                            .collect::<Vec<_>>();
                        duration += start.elapsed();
                        state.device_state.queue.submit(buffers);
                        state.device_state.device.poll(wgpu::Maintain::Wait);
                    }
                    duration
                })
            },
        );
    }
    group.finish();
    // Test the bindless variant: a single pass with DISPATCH_COUNT_BINDLESS dispatches.
    let mut group = ctx.benchmark_group("Computepass: Bindless");
    group.throughput(Throughput::Elements(DISPATCH_COUNT_BINDLESS as _));
    group.bench_function(&format!("{DISPATCH_COUNT_BINDLESS} dispatch"), |b| {
        Lazy::force(&state);
        b.iter_custom(|iters| {
            profiling::scope!("benchmark invocation");
            // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
            if state.device_state.adapter_info.name.contains("Paravirtual") {
                return Duration::from_secs_f32(1.0);
            }
            // Need bindless to run this benchmark
            if state.bindless_bind_group.is_none() {
                return Duration::from_secs_f32(1.0);
            }
            let mut duration = Duration::ZERO;
            for _ in 0..iters {
                profiling::scope!("benchmark iteration");
                let start = Instant::now();
                let buffer = state.run_bindless_pass();
                duration += start.elapsed();
                state.device_state.queue.submit([buffer]);
                state.device_state.device.poll(wgpu::Maintain::Wait);
            }
            duration
        })
    });
    group.finish();
    // Measure the fixed cost of an empty submit while many resources are alive.
    ctx.bench_function(
        &format!(
            "Computepass: Empty Submit with {} Resources",
            TEXTURE_COUNT + STORAGE_TEXTURE_COUNT + STORAGE_BUFFER_COUNT
        ),
        |b| {
            Lazy::force(&state);
            b.iter(|| state.device_state.queue.submit([]));
        },
    );
}
// Register the benchmark group; 10s measurement time since each iteration
// records & submits thousands of dispatches.
criterion_group! {
    name = computepass;
    config = Criterion::default().measurement_time(Duration::from_secs(10));
    targets = run_bench,
}

View File

@ -0,0 +1,26 @@
// Non-bindless computepass benchmark shader: every dispatch is given its own
// bind group holding 2 sampled textures, 2 storage textures and 2 storage
// buffers (must match TEXTURES_PER_DISPATCH etc. in computepass.rs).
@group(0) @binding(0)
var tex_0: texture_2d<f32>;
@group(0) @binding(1)
var tex_1: texture_2d<f32>;
@group(0) @binding(2)
var image_0: texture_storage_2d<r32float, read_write>;
@group(0) @binding(3)
var image_1: texture_storage_2d<r32float, read_write>;
@group(0) @binding(4)
var<storage, read_write> buffer0 : array<vec4f>;
@group(0) @binding(5)
var<storage, read_write> buffer1 : array<vec4f>;
@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    // Touch every bound resource once so that wgpu has to track/barrier all of them.
    let tex = textureLoad(tex_0, vec2u(0), 0) + textureLoad(tex_1, vec2u(0), 0);
    let image = textureLoad(image_0, vec2u(0)) + textureLoad(image_1, vec2u(0));
    buffer0[0] = tex.rrrr;
    buffer1[0] = image.rrrr;
}

View File

@ -10,7 +10,12 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::DeviceState;
// Number of draw calls recorded by the renderpass benchmarks.
#[cfg(test)]
const DRAW_COUNT: usize = 8; // Running with up to 8 threads.
#[cfg(not(test))]
const DRAW_COUNT: usize = 10_000;
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;

View File

@ -1,6 +1,7 @@
use criterion::criterion_main;
use pollster::block_on;
mod computepass;
mod renderpass;
mod resource_creation;
mod shader;
@ -45,7 +46,7 @@ impl DeviceState {
required_features: adapter.features(),
required_limits: adapter.limits(),
memory_hints: wgpu::MemoryHints::Performance,
label: Some("RenderPass Device"),
label: Some("Compute/RenderPass Device"),
},
None,
))
@ -61,6 +62,7 @@ impl DeviceState {
// All benchmark groups executed by `cargo bench`.
criterion_main!(
    renderpass::renderpass,
    computepass::computepass,
    resource_creation::resource_creation,
    shader::shader
);

View File

@ -66,7 +66,7 @@ pub enum CreateBindGroupLayoutError {
},
#[error(transparent)]
TooManyBindings(BindingTypeMaxCountError),
#[error("Binding index {binding} is greater than the maximum index {maximum}")]
#[error("Binding index {binding} is greater than the maximum number {maximum}")]
InvalidBindingIndex { binding: u32, maximum: u32 },
#[error("Invalid visibility {0:?}")]
InvalidVisibility(wgt::ShaderStages),