mirror of
https://github.com/gfx-rs/wgpu.git
synced 2024-11-21 14:23:32 +00:00
Compute pass benchmark (#5767)
Adds a benchmark for compute pass recording, very similar to what we have for render passes.
This commit is contained in:
parent
586215ab2e
commit
d3edbc57a9
@ -183,6 +183,7 @@ By @teoxoy in [#5901](https://github.com/gfx-rs/wgpu/pull/5901)
|
||||
- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
|
||||
- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
|
||||
- `wgpu::Error` is now `Sync`, making it possible to be wrapped in `anyhow::Error` or `eyre::Report`. By @nolanderc in [#5820](https://github.com/gfx-rs/wgpu/pull/5820)
|
||||
- Added benchmark suite. By @cwfitzgerald in [#5694](https://github.com/gfx-rs/wgpu/pull/5694), compute passes by @wumpf in [#5767](https://github.com/gfx-rs/wgpu/pull/5767)
|
||||
|
||||
#### Metal
|
||||
- Removed the `link` Cargo feature.
|
||||
|
@ -43,4 +43,4 @@ pollster.workspace = true
|
||||
profiling.workspace = true
|
||||
rayon.workspace = true
|
||||
tracy-client = { workspace = true, optional = true }
|
||||
wgpu.workspace = true
|
||||
wgpu = { workspace = true, features = ["wgsl"] }
|
||||
|
@ -24,6 +24,21 @@ By default it measures 10k draw calls, with 90k total resources.
|
||||
|
||||
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
|
||||
the render pass into multiple passes over multiple command buffers.
|
||||
If available, it also tests a bindless approach, binding all textures at once instead of switching
|
||||
the bind group for every draw call.
|
||||
|
||||
#### `Computepass`
|
||||
|
||||
This benchmark measures the performance of recording and submitting a compute pass with a large
|
||||
number of dispatches and resources.
|
||||
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
|
||||
|
||||
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
|
||||
the compute pass into multiple passes over multiple command buffers.
|
||||
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
|
||||
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
|
||||
|
||||
|
||||
#### `Resource Creation`
|
||||
|
||||
|
26
benches/benches/computepass-bindless.wgsl
Normal file
26
benches/benches/computepass-bindless.wgsl
Normal file
@ -0,0 +1,26 @@
|
||||
// Bindless computepass benchmark shader: every dispatch indexes into binding
// arrays instead of rebinding a per-dispatch bind group.
@group(0) @binding(0)
var tex: binding_array<texture_2d<f32>>;

@group(0) @binding(1)
// TODO(https://github.com/gfx-rs/wgpu/issues/5765): The extra whitespace between the angle brackets is needed to workaround a parsing bug.
var images: binding_array<texture_storage_2d<r32float, read_write> >;

struct BufferElement {
    element: vec4f,
}

@group(0) @binding(2)
var<storage, read_write> buffers: binding_array<BufferElement>;

@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    let offset = global_invocation_id.x; // Would be nice to offset this dynamically (it's just 0 always in the current setup)

    // Each invocation touches two consecutive entries of every binding array.
    let idx0 = offset * 2 + 0;
    let idx1 = offset * 2 + 1;

    // Fixed: the second sampled-texture load previously reused `idx0`, so
    // `tex[idx1]` was never read — unlike the storage-texture and buffer
    // accesses below, and unlike the non-bindless shader (tex_0 + tex_1).
    let tex = textureLoad(tex[idx0], vec2u(0), 0) + textureLoad(tex[idx1], vec2u(0), 0);
    let image = textureLoad(images[idx0], vec2u(0)) + textureLoad(images[idx1], vec2u(0));
    buffers[idx0].element = tex.rrrr;
    buffers[idx1].element = image.rrrr;
}
|
574
benches/benches/computepass.rs
Normal file
574
benches/benches/computepass.rs
Normal file
@ -0,0 +1,574 @@
|
||||
use std::{
|
||||
num::{NonZeroU32, NonZeroU64},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use criterion::{criterion_group, Criterion, Throughput};
|
||||
use nanorand::{Rng, WyRand};
|
||||
use once_cell::sync::Lazy;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
|
||||
use crate::DeviceState;
|
||||
|
||||
// Number of dispatches recorded by the regular (non-bindless) benchmark.
#[cfg(test)]
const DISPATCH_COUNT: usize = 8; // Running with up to 8 threads.
#[cfg(not(test))]
const DISPATCH_COUNT: usize = 10_000;

// Currently bindless is _much_ slower than with regularly resources,
// since wgpu needs to issues barriers for all resources between each dispatch for all read/write textures & buffers.
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
#[cfg(test)]
const DISPATCH_COUNT_BINDLESS: usize = 8; // Running with up to 8 threads.
#[cfg(not(test))]
const DISPATCH_COUNT_BINDLESS: usize = 1_000;

// Must match the number of textures in the computepass.wgsl shader
const TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
const STORAGE_BUFFERS_PER_DISPATCH: usize = 2;

// Total resource counts for the non-bindless benchmark: one distinct set of
// resources per dispatch.
const TEXTURE_COUNT: usize = DISPATCH_COUNT * TEXTURES_PER_DISPATCH;
const STORAGE_TEXTURE_COUNT: usize = DISPATCH_COUNT * STORAGE_TEXTURES_PER_DISPATCH;
const STORAGE_BUFFER_COUNT: usize = DISPATCH_COUNT * STORAGE_BUFFERS_PER_DISPATCH;

// Size in bytes of each storage buffer — matches the vec4f `BufferElement`
// written by the shaders.
const BUFFER_SIZE: u64 = 16;
|
||||
|
||||
/// All GPU state shared by the computepass benchmark variants.
///
/// Built once (lazily) and reused across every criterion iteration.
struct ComputepassState {
    device_state: DeviceState,
    // Pipeline used by the regular benchmark; one bind group per dispatch.
    pipeline: wgpu::ComputePipeline,
    bind_groups: Vec<wgpu::BindGroup>,

    // Bindless resources — `None` when the adapter does not support the
    // required binding-array features (see `supports_bindless` in `new`).
    bindless_bind_group: Option<wgpu::BindGroup>,
    bindless_pipeline: Option<wgpu::ComputePipeline>,
}
|
||||
|
||||
impl ComputepassState {
|
||||
/// Create and prepare all the resources needed for the computepass benchmark.
|
||||
fn new() -> Self {
|
||||
let device_state = DeviceState::new();
|
||||
|
||||
let supports_bindless = device_state.device.features().contains(
|
||||
wgpu::Features::BUFFER_BINDING_ARRAY
|
||||
| wgpu::Features::TEXTURE_BINDING_ARRAY
|
||||
| wgpu::Features::STORAGE_RESOURCE_BINDING_ARRAY
|
||||
| wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
|
||||
)
|
||||
// TODO: as of writing llvmpipe segfaults the bindless benchmark on ci
|
||||
&& device_state.adapter_info.driver != "llvmpipe";
|
||||
|
||||
// Performance gets considerably worse if the resources are shuffled.
|
||||
//
|
||||
// This more closely matches the real-world use case where resources have no
|
||||
// well defined usage order.
|
||||
let mut random = WyRand::new_seed(0x8BADF00D);
|
||||
|
||||
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
|
||||
for i in 0..TEXTURES_PER_DISPATCH {
|
||||
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||
binding: i as u32,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Texture {
|
||||
sample_type: wgpu::TextureSampleType::Float { filterable: true },
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
multisampled: false,
|
||||
},
|
||||
count: None,
|
||||
});
|
||||
}
|
||||
for i in 0..STORAGE_TEXTURES_PER_DISPATCH {
|
||||
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||
binding: (TEXTURES_PER_DISPATCH + i) as u32,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::StorageTexture {
|
||||
access: wgpu::StorageTextureAccess::ReadWrite,
|
||||
format: wgpu::TextureFormat::R32Float,
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
},
|
||||
count: None,
|
||||
});
|
||||
}
|
||||
for i in 0..STORAGE_BUFFERS_PER_DISPATCH {
|
||||
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + i) as u32,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Buffer {
|
||||
ty: wgpu::BufferBindingType::Storage { read_only: false },
|
||||
has_dynamic_offset: false,
|
||||
min_binding_size: NonZeroU64::new(BUFFER_SIZE),
|
||||
},
|
||||
count: None,
|
||||
});
|
||||
}
|
||||
|
||||
let bind_group_layout =
|
||||
device_state
|
||||
.device
|
||||
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||
label: None,
|
||||
entries: &bind_group_layout_entries,
|
||||
});
|
||||
|
||||
let mut texture_views = Vec::with_capacity(TEXTURE_COUNT);
|
||||
for i in 0..TEXTURE_COUNT {
|
||||
let texture = device_state
|
||||
.device
|
||||
.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some(&format!("Texture {i}")),
|
||||
size: wgpu::Extent3d {
|
||||
width: 1,
|
||||
height: 1,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::Rgba8UnormSrgb,
|
||||
usage: wgpu::TextureUsages::TEXTURE_BINDING,
|
||||
view_formats: &[],
|
||||
});
|
||||
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
|
||||
label: Some(&format!("Texture View {i}")),
|
||||
..Default::default()
|
||||
}));
|
||||
}
|
||||
random.shuffle(&mut texture_views);
|
||||
let texture_view_refs: Vec<_> = texture_views.iter().collect();
|
||||
|
||||
let mut storage_texture_views = Vec::with_capacity(STORAGE_TEXTURE_COUNT);
|
||||
for i in 0..TEXTURE_COUNT {
|
||||
let texture = device_state
|
||||
.device
|
||||
.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some(&format!("StorageTexture {i}")),
|
||||
size: wgpu::Extent3d {
|
||||
width: 1,
|
||||
height: 1,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::R32Float,
|
||||
usage: wgpu::TextureUsages::STORAGE_BINDING,
|
||||
view_formats: &[],
|
||||
});
|
||||
storage_texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
|
||||
label: Some(&format!("StorageTexture View {i}")),
|
||||
..Default::default()
|
||||
}));
|
||||
}
|
||||
random.shuffle(&mut storage_texture_views);
|
||||
let storage_texture_view_refs: Vec<_> = storage_texture_views.iter().collect();
|
||||
|
||||
let mut storage_buffers = Vec::with_capacity(STORAGE_BUFFER_COUNT);
|
||||
for i in 0..STORAGE_BUFFER_COUNT {
|
||||
storage_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some(&format!("Buffer {i}")),
|
||||
size: BUFFER_SIZE,
|
||||
usage: wgpu::BufferUsages::STORAGE,
|
||||
mapped_at_creation: false,
|
||||
}));
|
||||
}
|
||||
random.shuffle(&mut storage_buffers);
|
||||
let storage_buffer_bindings: Vec<_> = storage_buffers
|
||||
.iter()
|
||||
.map(|b| b.as_entire_buffer_binding())
|
||||
.collect();
|
||||
|
||||
let mut bind_groups = Vec::with_capacity(DISPATCH_COUNT);
|
||||
for dispatch_idx in 0..DISPATCH_COUNT {
|
||||
let mut entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
|
||||
for tex_idx in 0..TEXTURES_PER_DISPATCH {
|
||||
entries.push(wgpu::BindGroupEntry {
|
||||
binding: tex_idx as u32,
|
||||
resource: wgpu::BindingResource::TextureView(
|
||||
&texture_views[dispatch_idx * TEXTURES_PER_DISPATCH + tex_idx],
|
||||
),
|
||||
});
|
||||
}
|
||||
for tex_idx in 0..STORAGE_TEXTURES_PER_DISPATCH {
|
||||
entries.push(wgpu::BindGroupEntry {
|
||||
binding: (TEXTURES_PER_DISPATCH + tex_idx) as u32,
|
||||
resource: wgpu::BindingResource::TextureView(
|
||||
&storage_texture_views
|
||||
[dispatch_idx * STORAGE_TEXTURES_PER_DISPATCH + tex_idx],
|
||||
),
|
||||
});
|
||||
}
|
||||
for buffer_idx in 0..STORAGE_BUFFERS_PER_DISPATCH {
|
||||
entries.push(wgpu::BindGroupEntry {
|
||||
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + buffer_idx)
|
||||
as u32,
|
||||
resource: wgpu::BindingResource::Buffer(
|
||||
storage_buffers[dispatch_idx * STORAGE_BUFFERS_PER_DISPATCH + buffer_idx]
|
||||
.as_entire_buffer_binding(),
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
bind_groups.push(
|
||||
device_state
|
||||
.device
|
||||
.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
label: None,
|
||||
layout: &bind_group_layout,
|
||||
entries: &entries,
|
||||
}),
|
||||
);
|
||||
}
|
||||
random.shuffle(&mut bind_groups);
|
||||
|
||||
let sm = device_state
|
||||
.device
|
||||
.create_shader_module(wgpu::include_wgsl!("computepass.wgsl"));
|
||||
|
||||
let pipeline_layout =
|
||||
device_state
|
||||
.device
|
||||
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||||
label: None,
|
||||
bind_group_layouts: &[&bind_group_layout],
|
||||
push_constant_ranges: &[],
|
||||
});
|
||||
|
||||
let pipeline =
|
||||
device_state
|
||||
.device
|
||||
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
|
||||
label: Some("Compute Pipeline"),
|
||||
layout: Some(&pipeline_layout),
|
||||
module: &sm,
|
||||
entry_point: "cs_main",
|
||||
compilation_options: wgpu::PipelineCompilationOptions::default(),
|
||||
cache: None,
|
||||
});
|
||||
|
||||
let (bindless_bind_group, bindless_pipeline) = if supports_bindless {
|
||||
let bindless_bind_group_layout =
|
||||
device_state
|
||||
.device
|
||||
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||
label: None,
|
||||
entries: &[
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding: 0,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Texture {
|
||||
sample_type: wgpu::TextureSampleType::Float {
|
||||
filterable: true,
|
||||
},
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
multisampled: false,
|
||||
},
|
||||
count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()),
|
||||
},
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding: 1,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::StorageTexture {
|
||||
access: wgpu::StorageTextureAccess::ReadWrite,
|
||||
format: wgpu::TextureFormat::R32Float,
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
},
|
||||
count: Some(NonZeroU32::new(STORAGE_TEXTURE_COUNT as u32).unwrap()),
|
||||
},
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding: 2,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Buffer {
|
||||
ty: wgpu::BufferBindingType::Storage { read_only: false },
|
||||
has_dynamic_offset: false,
|
||||
min_binding_size: std::num::NonZeroU64::new(BUFFER_SIZE),
|
||||
},
|
||||
count: Some(NonZeroU32::new(STORAGE_BUFFER_COUNT as u32).unwrap()),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
let bindless_bind_group =
|
||||
device_state
|
||||
.device
|
||||
.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
label: None,
|
||||
layout: &bindless_bind_group_layout,
|
||||
entries: &[
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 0,
|
||||
resource: wgpu::BindingResource::TextureViewArray(
|
||||
&texture_view_refs[..DISPATCH_COUNT_BINDLESS],
|
||||
),
|
||||
},
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 1,
|
||||
resource: wgpu::BindingResource::TextureViewArray(
|
||||
&storage_texture_view_refs[..DISPATCH_COUNT_BINDLESS],
|
||||
),
|
||||
},
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 2,
|
||||
resource: wgpu::BindingResource::BufferArray(
|
||||
&storage_buffer_bindings[..DISPATCH_COUNT_BINDLESS],
|
||||
),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
let bindless_sm = device_state
|
||||
.device
|
||||
.create_shader_module(wgpu::include_wgsl!("computepass-bindless.wgsl"));
|
||||
|
||||
let bindless_pipeline_layout =
|
||||
device_state
|
||||
.device
|
||||
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||||
label: None,
|
||||
bind_group_layouts: &[&bindless_bind_group_layout],
|
||||
push_constant_ranges: &[],
|
||||
});
|
||||
|
||||
let bindless_pipeline =
|
||||
device_state
|
||||
.device
|
||||
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
|
||||
label: Some("Compute Pipeline bindless"),
|
||||
layout: Some(&bindless_pipeline_layout),
|
||||
module: &bindless_sm,
|
||||
entry_point: "cs_main",
|
||||
compilation_options: wgpu::PipelineCompilationOptions::default(),
|
||||
cache: None,
|
||||
});
|
||||
|
||||
(Some(bindless_bind_group), Some(bindless_pipeline))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
Self {
|
||||
device_state,
|
||||
pipeline,
|
||||
bind_groups,
|
||||
|
||||
bindless_bind_group,
|
||||
bindless_pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
|
||||
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
|
||||
|
||||
let dispatch_per_pass = DISPATCH_COUNT / total_passes;
|
||||
|
||||
let mut encoder = self
|
||||
.device_state
|
||||
.device
|
||||
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
|
||||
|
||||
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
|
||||
label: None,
|
||||
timestamp_writes: None,
|
||||
});
|
||||
|
||||
let start_idx = pass_number * dispatch_per_pass;
|
||||
let end_idx = start_idx + dispatch_per_pass;
|
||||
for dispatch_idx in start_idx..end_idx {
|
||||
compute_pass.set_pipeline(&self.pipeline);
|
||||
compute_pass.set_bind_group(0, &self.bind_groups[dispatch_idx], &[]);
|
||||
compute_pass.dispatch_workgroups(1, 1, 1);
|
||||
}
|
||||
|
||||
drop(compute_pass);
|
||||
|
||||
encoder.finish()
|
||||
}
|
||||
|
||||
fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
|
||||
profiling::scope!("Bindless Computepass");
|
||||
|
||||
let mut encoder = self
|
||||
.device_state
|
||||
.device
|
||||
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
|
||||
|
||||
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
|
||||
label: None,
|
||||
timestamp_writes: None,
|
||||
});
|
||||
|
||||
compute_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
|
||||
compute_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
|
||||
for _ in 0..DISPATCH_COUNT_BINDLESS {
|
||||
compute_pass.dispatch_workgroups(1, 1, 1);
|
||||
}
|
||||
|
||||
drop(compute_pass);
|
||||
|
||||
encoder.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Criterion entry point: registers all computepass benchmark groups.
///
/// The shared GPU state is built lazily via `Lazy::force` inside each bench
/// closure so that setup cost is excluded from measurement and only paid once.
fn run_bench(ctx: &mut Criterion) {
    let state = Lazy::new(ComputepassState::new);

    // Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
    let mut group = ctx.benchmark_group("Computepass: Single Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));

    // Each configuration is measured twice: once timing pass recording
    // ("Computepass Time") and once timing only queue submission ("Submit Time").
    for time_submit in [false, true] {
        for cpasses in [1, 2, 4, 8] {
            let dispatch_per_pass = DISPATCH_COUNT / cpasses;

            let label = if time_submit {
                "Submit Time"
            } else {
                "Computepass Time"
            };

            group.bench_function(
                &format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
                |b| {
                    Lazy::force(&state);

                    // iter_custom lets us time only the phase we care about.
                    b.iter_custom(|iters| {
                        profiling::scope!("benchmark invocation");

                        let mut duration = Duration::ZERO;

                        for _ in 0..iters {
                            profiling::scope!("benchmark iteration");

                            let mut start = Instant::now();

                            let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
                            for i in 0..cpasses {
                                buffers.push(state.run_subpass(i, cpasses));
                            }

                            // Either restart the clock (to time submission only)
                            // or bank the recording time now.
                            if time_submit {
                                start = Instant::now();
                            } else {
                                duration += start.elapsed();
                            }

                            state.device_state.queue.submit(buffers);

                            if time_submit {
                                duration += start.elapsed();
                            }

                            // Wait for the GPU so iterations don't overlap.
                            state.device_state.device.poll(wgpu::Maintain::Wait);
                        }

                        duration
                    })
                },
            );
        }
    }
    group.finish();

    // Test 10k dispatch calls split up over 2, 4, and 8 threads.
    let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
    group.throughput(Throughput::Elements(DISPATCH_COUNT as _));

    for threads in [2, 4, 8] {
        let dispatch_per_pass = DISPATCH_COUNT / threads;
        group.bench_function(
            &format!("{threads} threads x {dispatch_per_pass} dispatch"),
            |b| {
                Lazy::force(&state);

                b.iter_custom(|iters| {
                    profiling::scope!("benchmark invocation");

                    // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
                    if state.device_state.adapter_info.name.contains("Paravirtual") {
                        return Duration::from_secs_f32(1.0);
                    }

                    let mut duration = Duration::ZERO;

                    for _ in 0..iters {
                        profiling::scope!("benchmark iteration");

                        let start = Instant::now();

                        // Record one pass per thread in parallel via rayon;
                        // only the recording is timed, not submission.
                        let buffers = (0..threads)
                            .into_par_iter()
                            .map(|i| state.run_subpass(i, threads))
                            .collect::<Vec<_>>();

                        duration += start.elapsed();

                        state.device_state.queue.submit(buffers);
                        state.device_state.device.poll(wgpu::Maintain::Wait);
                    }

                    duration
                })
            },
        );
    }
    group.finish();

    // Single bindless pass: 1k dispatches against one big bind group.
    let mut group = ctx.benchmark_group("Computepass: Bindless");
    group.throughput(Throughput::Elements(DISPATCH_COUNT_BINDLESS as _));

    group.bench_function(&format!("{DISPATCH_COUNT_BINDLESS} dispatch"), |b| {
        Lazy::force(&state);

        b.iter_custom(|iters| {
            profiling::scope!("benchmark invocation");

            // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
            if state.device_state.adapter_info.name.contains("Paravirtual") {
                return Duration::from_secs_f32(1.0);
            }

            // Need bindless to run this benchmark
            if state.bindless_bind_group.is_none() {
                return Duration::from_secs_f32(1.0);
            }

            let mut duration = Duration::ZERO;

            for _ in 0..iters {
                profiling::scope!("benchmark iteration");

                let start = Instant::now();

                let buffer = state.run_bindless_pass();

                duration += start.elapsed();

                state.device_state.queue.submit([buffer]);
                state.device_state.device.poll(wgpu::Maintain::Wait);
            }

            duration
        })
    });
    group.finish();

    // Baseline: cost of an empty submit while all benchmark resources are alive.
    ctx.bench_function(
        &format!(
            "Computepass: Empty Submit with {} Resources",
            TEXTURE_COUNT + STORAGE_TEXTURE_COUNT + STORAGE_BUFFER_COUNT
        ),
        |b| {
            Lazy::force(&state);

            b.iter(|| state.device_state.queue.submit([]));
        },
    );
}
|
||||
|
||||
// Criterion group wiring; `computepass` is referenced from criterion_main! in
// the crate root.
criterion_group! {
    name = computepass;
    config = Criterion::default().measurement_time(Duration::from_secs(10));
    targets = run_bench,
}
|
26
benches/benches/computepass.wgsl
Normal file
26
benches/benches/computepass.wgsl
Normal file
@ -0,0 +1,26 @@
|
||||
// Non-bindless computepass benchmark shader.
//
// The binding list must match the bind group layout built in computepass.rs:
// sampled textures first, then storage textures, then storage buffers
// (TEXTURES_PER_DISPATCH / STORAGE_TEXTURES_PER_DISPATCH /
// STORAGE_BUFFERS_PER_DISPATCH of each — 2 apiece).
@group(0) @binding(0)
var tex_0: texture_2d<f32>;

@group(0) @binding(1)
var tex_1: texture_2d<f32>;

@group(0) @binding(2)
var image_0: texture_storage_2d<r32float, read_write>;

@group(0) @binding(3)
var image_1: texture_storage_2d<r32float, read_write>;

@group(0) @binding(4)
var<storage, read_write> buffer0 : array<vec4f>;

@group(0) @binding(5)
var<storage, read_write> buffer1 : array<vec4f>;

@compute
@workgroup_size(16)
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    // Touch every bound resource once so the benchmark exercises real
    // read/write dependencies between dispatches.
    let tex = textureLoad(tex_0, vec2u(0), 0) + textureLoad(tex_1, vec2u(0), 0);
    let image = textureLoad(image_0, vec2u(0)) + textureLoad(image_1, vec2u(0));
    buffer0[0] = tex.rrrr;
    buffer1[0] = image.rrrr;
}
|
@ -10,7 +10,12 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
|
||||
use crate::DeviceState;
|
||||
|
||||
#[cfg(test)]
|
||||
const DRAW_COUNT: usize = 8; // Running with up to 8 threads.
|
||||
|
||||
#[cfg(not(test))]
|
||||
const DRAW_COUNT: usize = 10_000;
|
||||
|
||||
// Must match the number of textures in the renderpass.wgsl shader
|
||||
const TEXTURES_PER_DRAW: usize = 7;
|
||||
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
|
||||
|
@ -1,6 +1,7 @@
|
||||
use criterion::criterion_main;
|
||||
use pollster::block_on;
|
||||
|
||||
mod computepass;
|
||||
mod renderpass;
|
||||
mod resource_creation;
|
||||
mod shader;
|
||||
@ -45,7 +46,7 @@ impl DeviceState {
|
||||
required_features: adapter.features(),
|
||||
required_limits: adapter.limits(),
|
||||
memory_hints: wgpu::MemoryHints::Performance,
|
||||
label: Some("RenderPass Device"),
|
||||
label: Some("Compute/RenderPass Device"),
|
||||
},
|
||||
None,
|
||||
))
|
||||
@ -61,6 +62,7 @@ impl DeviceState {
|
||||
|
||||
criterion_main!(
|
||||
renderpass::renderpass,
|
||||
computepass::computepass,
|
||||
resource_creation::resource_creation,
|
||||
shader::shader
|
||||
);
|
||||
|
@ -66,7 +66,7 @@ pub enum CreateBindGroupLayoutError {
|
||||
},
|
||||
#[error(transparent)]
|
||||
TooManyBindings(BindingTypeMaxCountError),
|
||||
#[error("Binding index {binding} is greater than the maximum index {maximum}")]
|
||||
#[error("Binding index {binding} is greater than the maximum number {maximum}")]
|
||||
InvalidBindingIndex { binding: u32, maximum: u32 },
|
||||
#[error("Invalid visibility {0:?}")]
|
||||
InvalidVisibility(wgt::ShaderStages),
|
||||
|
Loading…
Reference in New Issue
Block a user