mirror of
https://github.com/gfx-rs/wgpu.git
synced 2024-11-21 22:33:49 +00:00
Compute pass benchmark (#5767)
Adds a benchmark for compute pass recording, very similar to what we have for render passes.
This commit is contained in:
parent
586215ab2e
commit
d3edbc57a9
@ -183,6 +183,7 @@ By @teoxoy in [#5901](https://github.com/gfx-rs/wgpu/pull/5901)
|
|||||||
- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
|
- Unconsumed vertex outputs are now always allowed. Removed `StageError::InputNotConsumed`, `Features::SHADER_UNUSED_VERTEX_OUTPUT`, and associated validation. By @Imberflur in [#5531](https://github.com/gfx-rs/wgpu/pull/5531)
|
||||||
- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
|
- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
|
||||||
- `wgpu::Error` is now `Sync`, making it possible to be wrapped in `anyhow::Error` or `eyre::Report`. By @nolanderc in [#5820](https://github.com/gfx-rs/wgpu/pull/5820)
|
- `wgpu::Error` is now `Sync`, making it possible to be wrapped in `anyhow::Error` or `eyre::Report`. By @nolanderc in [#5820](https://github.com/gfx-rs/wgpu/pull/5820)
|
||||||
|
- Added benchmark suite. By @cwfitzgerald in [#5694](https://github.com/gfx-rs/wgpu/pull/5694), compute passes by @wumpf in [#5767](https://github.com/gfx-rs/wgpu/pull/5767)
|
||||||
|
|
||||||
#### Metal
|
#### Metal
|
||||||
- Removed the `link` Cargo feature.
|
- Removed the `link` Cargo feature.
|
||||||
|
@ -43,4 +43,4 @@ pollster.workspace = true
|
|||||||
profiling.workspace = true
|
profiling.workspace = true
|
||||||
rayon.workspace = true
|
rayon.workspace = true
|
||||||
tracy-client = { workspace = true, optional = true }
|
tracy-client = { workspace = true, optional = true }
|
||||||
wgpu.workspace = true
|
wgpu = { workspace = true, features = ["wgsl"] }
|
||||||
|
@ -24,6 +24,21 @@ By default it measures 10k draw calls, with 90k total resources.
|
|||||||
|
|
||||||
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
|
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
|
||||||
the render pass into multiple passes over multiple command buffers.
|
the render pass into multiple passes over multiple command buffers.
|
||||||
|
If available, it also tests a bindless approach, binding all textures at once instead of switching
|
||||||
|
the bind group for every draw call.
|
||||||
|
|
||||||
|
#### `Computepass`
|
||||||
|
|
||||||
|
This benchmark measures the performance of recording and submitting a compute pass with a large
|
||||||
|
number of dispatches and resources.
|
||||||
|
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
|
||||||
|
|
||||||
|
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
|
||||||
|
the compute pass into multiple passes over multiple command buffers.
|
||||||
|
If available, it also tests a bindless approach, binding all resources at once instead of switching
|
||||||
|
the bind group for every draw call.
|
||||||
|
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
|
||||||
|
|
||||||
|
|
||||||
#### `Resource Creation`
|
#### `Resource Creation`
|
||||||
|
|
||||||
|
26
benches/benches/computepass-bindless.wgsl
Normal file
26
benches/benches/computepass-bindless.wgsl
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
@group(0) @binding(0)
|
||||||
|
var tex: binding_array<texture_2d<f32>>;
|
||||||
|
|
||||||
|
@group(0) @binding(1)
|
||||||
|
// TODO(https://github.com/gfx-rs/wgpu/issues/5765): The extra whitespace between the angle brackets is needed to workaround a parsing bug.
|
||||||
|
var images: binding_array<texture_storage_2d<r32float, read_write> >;
|
||||||
|
struct BufferElement {
|
||||||
|
element: vec4f,
|
||||||
|
}
|
||||||
|
|
||||||
|
@group(0) @binding(2)
|
||||||
|
var<storage, read_write> buffers: binding_array<BufferElement>;
|
||||||
|
|
||||||
|
@compute
|
||||||
|
@workgroup_size(16)
|
||||||
|
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
|
||||||
|
let offset = global_invocation_id.x; // Would be nice to offset this dynamically (it's just 0 always in the current setup)
|
||||||
|
|
||||||
|
let idx0 = offset * 2 + 0;
|
||||||
|
let idx1 = offset * 2 + 1;
|
||||||
|
|
||||||
|
let tex = textureLoad(tex[idx0], vec2u(0), 0) + textureLoad(tex[idx0], vec2u(0), 0);
|
||||||
|
let image = textureLoad(images[idx0], vec2u(0)) + textureLoad(images[idx1], vec2u(0));
|
||||||
|
buffers[idx0].element = tex.rrrr;
|
||||||
|
buffers[idx1].element = image.rrrr;
|
||||||
|
}
|
574
benches/benches/computepass.rs
Normal file
574
benches/benches/computepass.rs
Normal file
@ -0,0 +1,574 @@
|
|||||||
|
use std::{
|
||||||
|
num::{NonZeroU32, NonZeroU64},
|
||||||
|
time::{Duration, Instant},
|
||||||
|
};
|
||||||
|
|
||||||
|
use criterion::{criterion_group, Criterion, Throughput};
|
||||||
|
use nanorand::{Rng, WyRand};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||||
|
|
||||||
|
use crate::DeviceState;
|
||||||
|
|
||||||
|
#[cfg(not(test))]
|
||||||
|
const DISPATCH_COUNT: usize = 10_000;
|
||||||
|
#[cfg(test)]
|
||||||
|
const DISPATCH_COUNT: usize = 8; // Running with up to 8 threads.
|
||||||
|
|
||||||
|
// Currently bindless is _much_ slower than with regularly resources,
|
||||||
|
// since wgpu needs to issues barriers for all resources between each dispatch for all read/write textures & buffers.
|
||||||
|
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
|
||||||
|
// resources as the regular benchmark.
|
||||||
|
// For details see https://github.com/gfx-rs/wgpu/issues/5766
|
||||||
|
#[cfg(not(test))]
|
||||||
|
const DISPATCH_COUNT_BINDLESS: usize = 1_000;
|
||||||
|
#[cfg(test)]
|
||||||
|
const DISPATCH_COUNT_BINDLESS: usize = 8; // Running with up to 8 threads.
|
||||||
|
|
||||||
|
// Must match the number of textures in the computepass.wgsl shader
|
||||||
|
const TEXTURES_PER_DISPATCH: usize = 2;
|
||||||
|
const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
|
||||||
|
const STORAGE_BUFFERS_PER_DISPATCH: usize = 2;
|
||||||
|
|
||||||
|
const TEXTURE_COUNT: usize = DISPATCH_COUNT * TEXTURES_PER_DISPATCH;
|
||||||
|
const STORAGE_TEXTURE_COUNT: usize = DISPATCH_COUNT * STORAGE_TEXTURES_PER_DISPATCH;
|
||||||
|
const STORAGE_BUFFER_COUNT: usize = DISPATCH_COUNT * STORAGE_BUFFERS_PER_DISPATCH;
|
||||||
|
|
||||||
|
const BUFFER_SIZE: u64 = 16;
|
||||||
|
|
||||||
|
struct ComputepassState {
|
||||||
|
device_state: DeviceState,
|
||||||
|
pipeline: wgpu::ComputePipeline,
|
||||||
|
bind_groups: Vec<wgpu::BindGroup>,
|
||||||
|
|
||||||
|
// Bindless resources
|
||||||
|
bindless_bind_group: Option<wgpu::BindGroup>,
|
||||||
|
bindless_pipeline: Option<wgpu::ComputePipeline>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ComputepassState {
|
||||||
|
/// Create and prepare all the resources needed for the computepass benchmark.
|
||||||
|
fn new() -> Self {
|
||||||
|
let device_state = DeviceState::new();
|
||||||
|
|
||||||
|
let supports_bindless = device_state.device.features().contains(
|
||||||
|
wgpu::Features::BUFFER_BINDING_ARRAY
|
||||||
|
| wgpu::Features::TEXTURE_BINDING_ARRAY
|
||||||
|
| wgpu::Features::STORAGE_RESOURCE_BINDING_ARRAY
|
||||||
|
| wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
|
||||||
|
)
|
||||||
|
// TODO: as of writing llvmpipe segfaults the bindless benchmark on ci
|
||||||
|
&& device_state.adapter_info.driver != "llvmpipe";
|
||||||
|
|
||||||
|
// Performance gets considerably worse if the resources are shuffled.
|
||||||
|
//
|
||||||
|
// This more closely matches the real-world use case where resources have no
|
||||||
|
// well defined usage order.
|
||||||
|
let mut random = WyRand::new_seed(0x8BADF00D);
|
||||||
|
|
||||||
|
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
|
||||||
|
for i in 0..TEXTURES_PER_DISPATCH {
|
||||||
|
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: i as u32,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::Texture {
|
||||||
|
sample_type: wgpu::TextureSampleType::Float { filterable: true },
|
||||||
|
view_dimension: wgpu::TextureViewDimension::D2,
|
||||||
|
multisampled: false,
|
||||||
|
},
|
||||||
|
count: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for i in 0..STORAGE_TEXTURES_PER_DISPATCH {
|
||||||
|
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: (TEXTURES_PER_DISPATCH + i) as u32,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::StorageTexture {
|
||||||
|
access: wgpu::StorageTextureAccess::ReadWrite,
|
||||||
|
format: wgpu::TextureFormat::R32Float,
|
||||||
|
view_dimension: wgpu::TextureViewDimension::D2,
|
||||||
|
},
|
||||||
|
count: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for i in 0..STORAGE_BUFFERS_PER_DISPATCH {
|
||||||
|
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + i) as u32,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::Buffer {
|
||||||
|
ty: wgpu::BufferBindingType::Storage { read_only: false },
|
||||||
|
has_dynamic_offset: false,
|
||||||
|
min_binding_size: NonZeroU64::new(BUFFER_SIZE),
|
||||||
|
},
|
||||||
|
count: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let bind_group_layout =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||||
|
label: None,
|
||||||
|
entries: &bind_group_layout_entries,
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut texture_views = Vec::with_capacity(TEXTURE_COUNT);
|
||||||
|
for i in 0..TEXTURE_COUNT {
|
||||||
|
let texture = device_state
|
||||||
|
.device
|
||||||
|
.create_texture(&wgpu::TextureDescriptor {
|
||||||
|
label: Some(&format!("Texture {i}")),
|
||||||
|
size: wgpu::Extent3d {
|
||||||
|
width: 1,
|
||||||
|
height: 1,
|
||||||
|
depth_or_array_layers: 1,
|
||||||
|
},
|
||||||
|
mip_level_count: 1,
|
||||||
|
sample_count: 1,
|
||||||
|
dimension: wgpu::TextureDimension::D2,
|
||||||
|
format: wgpu::TextureFormat::Rgba8UnormSrgb,
|
||||||
|
usage: wgpu::TextureUsages::TEXTURE_BINDING,
|
||||||
|
view_formats: &[],
|
||||||
|
});
|
||||||
|
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
|
||||||
|
label: Some(&format!("Texture View {i}")),
|
||||||
|
..Default::default()
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
random.shuffle(&mut texture_views);
|
||||||
|
let texture_view_refs: Vec<_> = texture_views.iter().collect();
|
||||||
|
|
||||||
|
let mut storage_texture_views = Vec::with_capacity(STORAGE_TEXTURE_COUNT);
|
||||||
|
for i in 0..TEXTURE_COUNT {
|
||||||
|
let texture = device_state
|
||||||
|
.device
|
||||||
|
.create_texture(&wgpu::TextureDescriptor {
|
||||||
|
label: Some(&format!("StorageTexture {i}")),
|
||||||
|
size: wgpu::Extent3d {
|
||||||
|
width: 1,
|
||||||
|
height: 1,
|
||||||
|
depth_or_array_layers: 1,
|
||||||
|
},
|
||||||
|
mip_level_count: 1,
|
||||||
|
sample_count: 1,
|
||||||
|
dimension: wgpu::TextureDimension::D2,
|
||||||
|
format: wgpu::TextureFormat::R32Float,
|
||||||
|
usage: wgpu::TextureUsages::STORAGE_BINDING,
|
||||||
|
view_formats: &[],
|
||||||
|
});
|
||||||
|
storage_texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
|
||||||
|
label: Some(&format!("StorageTexture View {i}")),
|
||||||
|
..Default::default()
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
random.shuffle(&mut storage_texture_views);
|
||||||
|
let storage_texture_view_refs: Vec<_> = storage_texture_views.iter().collect();
|
||||||
|
|
||||||
|
let mut storage_buffers = Vec::with_capacity(STORAGE_BUFFER_COUNT);
|
||||||
|
for i in 0..STORAGE_BUFFER_COUNT {
|
||||||
|
storage_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
|
||||||
|
label: Some(&format!("Buffer {i}")),
|
||||||
|
size: BUFFER_SIZE,
|
||||||
|
usage: wgpu::BufferUsages::STORAGE,
|
||||||
|
mapped_at_creation: false,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
random.shuffle(&mut storage_buffers);
|
||||||
|
let storage_buffer_bindings: Vec<_> = storage_buffers
|
||||||
|
.iter()
|
||||||
|
.map(|b| b.as_entire_buffer_binding())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut bind_groups = Vec::with_capacity(DISPATCH_COUNT);
|
||||||
|
for dispatch_idx in 0..DISPATCH_COUNT {
|
||||||
|
let mut entries = Vec::with_capacity(TEXTURES_PER_DISPATCH);
|
||||||
|
for tex_idx in 0..TEXTURES_PER_DISPATCH {
|
||||||
|
entries.push(wgpu::BindGroupEntry {
|
||||||
|
binding: tex_idx as u32,
|
||||||
|
resource: wgpu::BindingResource::TextureView(
|
||||||
|
&texture_views[dispatch_idx * TEXTURES_PER_DISPATCH + tex_idx],
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for tex_idx in 0..STORAGE_TEXTURES_PER_DISPATCH {
|
||||||
|
entries.push(wgpu::BindGroupEntry {
|
||||||
|
binding: (TEXTURES_PER_DISPATCH + tex_idx) as u32,
|
||||||
|
resource: wgpu::BindingResource::TextureView(
|
||||||
|
&storage_texture_views
|
||||||
|
[dispatch_idx * STORAGE_TEXTURES_PER_DISPATCH + tex_idx],
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for buffer_idx in 0..STORAGE_BUFFERS_PER_DISPATCH {
|
||||||
|
entries.push(wgpu::BindGroupEntry {
|
||||||
|
binding: (TEXTURES_PER_DISPATCH + STORAGE_BUFFERS_PER_DISPATCH + buffer_idx)
|
||||||
|
as u32,
|
||||||
|
resource: wgpu::BindingResource::Buffer(
|
||||||
|
storage_buffers[dispatch_idx * STORAGE_BUFFERS_PER_DISPATCH + buffer_idx]
|
||||||
|
.as_entire_buffer_binding(),
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
bind_groups.push(
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||||
|
label: None,
|
||||||
|
layout: &bind_group_layout,
|
||||||
|
entries: &entries,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
random.shuffle(&mut bind_groups);
|
||||||
|
|
||||||
|
let sm = device_state
|
||||||
|
.device
|
||||||
|
.create_shader_module(wgpu::include_wgsl!("computepass.wgsl"));
|
||||||
|
|
||||||
|
let pipeline_layout =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||||||
|
label: None,
|
||||||
|
bind_group_layouts: &[&bind_group_layout],
|
||||||
|
push_constant_ranges: &[],
|
||||||
|
});
|
||||||
|
|
||||||
|
let pipeline =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
|
||||||
|
label: Some("Compute Pipeline"),
|
||||||
|
layout: Some(&pipeline_layout),
|
||||||
|
module: &sm,
|
||||||
|
entry_point: "cs_main",
|
||||||
|
compilation_options: wgpu::PipelineCompilationOptions::default(),
|
||||||
|
cache: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
let (bindless_bind_group, bindless_pipeline) = if supports_bindless {
|
||||||
|
let bindless_bind_group_layout =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||||
|
label: None,
|
||||||
|
entries: &[
|
||||||
|
wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: 0,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::Texture {
|
||||||
|
sample_type: wgpu::TextureSampleType::Float {
|
||||||
|
filterable: true,
|
||||||
|
},
|
||||||
|
view_dimension: wgpu::TextureViewDimension::D2,
|
||||||
|
multisampled: false,
|
||||||
|
},
|
||||||
|
count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()),
|
||||||
|
},
|
||||||
|
wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: 1,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::StorageTexture {
|
||||||
|
access: wgpu::StorageTextureAccess::ReadWrite,
|
||||||
|
format: wgpu::TextureFormat::R32Float,
|
||||||
|
view_dimension: wgpu::TextureViewDimension::D2,
|
||||||
|
},
|
||||||
|
count: Some(NonZeroU32::new(STORAGE_TEXTURE_COUNT as u32).unwrap()),
|
||||||
|
},
|
||||||
|
wgpu::BindGroupLayoutEntry {
|
||||||
|
binding: 2,
|
||||||
|
visibility: wgpu::ShaderStages::COMPUTE,
|
||||||
|
ty: wgpu::BindingType::Buffer {
|
||||||
|
ty: wgpu::BufferBindingType::Storage { read_only: false },
|
||||||
|
has_dynamic_offset: false,
|
||||||
|
min_binding_size: std::num::NonZeroU64::new(BUFFER_SIZE),
|
||||||
|
},
|
||||||
|
count: Some(NonZeroU32::new(STORAGE_BUFFER_COUNT as u32).unwrap()),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
|
||||||
|
let bindless_bind_group =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||||
|
label: None,
|
||||||
|
layout: &bindless_bind_group_layout,
|
||||||
|
entries: &[
|
||||||
|
wgpu::BindGroupEntry {
|
||||||
|
binding: 0,
|
||||||
|
resource: wgpu::BindingResource::TextureViewArray(
|
||||||
|
&texture_view_refs[..DISPATCH_COUNT_BINDLESS],
|
||||||
|
),
|
||||||
|
},
|
||||||
|
wgpu::BindGroupEntry {
|
||||||
|
binding: 1,
|
||||||
|
resource: wgpu::BindingResource::TextureViewArray(
|
||||||
|
&storage_texture_view_refs[..DISPATCH_COUNT_BINDLESS],
|
||||||
|
),
|
||||||
|
},
|
||||||
|
wgpu::BindGroupEntry {
|
||||||
|
binding: 2,
|
||||||
|
resource: wgpu::BindingResource::BufferArray(
|
||||||
|
&storage_buffer_bindings[..DISPATCH_COUNT_BINDLESS],
|
||||||
|
),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
});
|
||||||
|
|
||||||
|
let bindless_sm = device_state
|
||||||
|
.device
|
||||||
|
.create_shader_module(wgpu::include_wgsl!("computepass-bindless.wgsl"));
|
||||||
|
|
||||||
|
let bindless_pipeline_layout =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||||||
|
label: None,
|
||||||
|
bind_group_layouts: &[&bindless_bind_group_layout],
|
||||||
|
push_constant_ranges: &[],
|
||||||
|
});
|
||||||
|
|
||||||
|
let bindless_pipeline =
|
||||||
|
device_state
|
||||||
|
.device
|
||||||
|
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
|
||||||
|
label: Some("Compute Pipeline bindless"),
|
||||||
|
layout: Some(&bindless_pipeline_layout),
|
||||||
|
module: &bindless_sm,
|
||||||
|
entry_point: "cs_main",
|
||||||
|
compilation_options: wgpu::PipelineCompilationOptions::default(),
|
||||||
|
cache: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
(Some(bindless_bind_group), Some(bindless_pipeline))
|
||||||
|
} else {
|
||||||
|
(None, None)
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
device_state,
|
||||||
|
pipeline,
|
||||||
|
bind_groups,
|
||||||
|
|
||||||
|
bindless_bind_group,
|
||||||
|
bindless_pipeline,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
|
||||||
|
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
|
||||||
|
|
||||||
|
let dispatch_per_pass = DISPATCH_COUNT / total_passes;
|
||||||
|
|
||||||
|
let mut encoder = self
|
||||||
|
.device_state
|
||||||
|
.device
|
||||||
|
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
|
||||||
|
|
||||||
|
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
|
||||||
|
label: None,
|
||||||
|
timestamp_writes: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
let start_idx = pass_number * dispatch_per_pass;
|
||||||
|
let end_idx = start_idx + dispatch_per_pass;
|
||||||
|
for dispatch_idx in start_idx..end_idx {
|
||||||
|
compute_pass.set_pipeline(&self.pipeline);
|
||||||
|
compute_pass.set_bind_group(0, &self.bind_groups[dispatch_idx], &[]);
|
||||||
|
compute_pass.dispatch_workgroups(1, 1, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(compute_pass);
|
||||||
|
|
||||||
|
encoder.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
|
||||||
|
profiling::scope!("Bindless Computepass");
|
||||||
|
|
||||||
|
let mut encoder = self
|
||||||
|
.device_state
|
||||||
|
.device
|
||||||
|
.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
|
||||||
|
|
||||||
|
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
|
||||||
|
label: None,
|
||||||
|
timestamp_writes: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
compute_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
|
||||||
|
compute_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
|
||||||
|
for _ in 0..DISPATCH_COUNT_BINDLESS {
|
||||||
|
compute_pass.dispatch_workgroups(1, 1, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(compute_pass);
|
||||||
|
|
||||||
|
encoder.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_bench(ctx: &mut Criterion) {
|
||||||
|
let state = Lazy::new(ComputepassState::new);
|
||||||
|
|
||||||
|
// Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
|
||||||
|
let mut group = ctx.benchmark_group("Computepass: Single Threaded");
|
||||||
|
group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
|
||||||
|
|
||||||
|
for time_submit in [false, true] {
|
||||||
|
for cpasses in [1, 2, 4, 8] {
|
||||||
|
let dispatch_per_pass = DISPATCH_COUNT / cpasses;
|
||||||
|
|
||||||
|
let label = if time_submit {
|
||||||
|
"Submit Time"
|
||||||
|
} else {
|
||||||
|
"Computepass Time"
|
||||||
|
};
|
||||||
|
|
||||||
|
group.bench_function(
|
||||||
|
&format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
|
||||||
|
|b| {
|
||||||
|
Lazy::force(&state);
|
||||||
|
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
profiling::scope!("benchmark invocation");
|
||||||
|
|
||||||
|
let mut duration = Duration::ZERO;
|
||||||
|
|
||||||
|
for _ in 0..iters {
|
||||||
|
profiling::scope!("benchmark iteration");
|
||||||
|
|
||||||
|
let mut start = Instant::now();
|
||||||
|
|
||||||
|
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
|
||||||
|
for i in 0..cpasses {
|
||||||
|
buffers.push(state.run_subpass(i, cpasses));
|
||||||
|
}
|
||||||
|
|
||||||
|
if time_submit {
|
||||||
|
start = Instant::now();
|
||||||
|
} else {
|
||||||
|
duration += start.elapsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
state.device_state.queue.submit(buffers);
|
||||||
|
|
||||||
|
if time_submit {
|
||||||
|
duration += start.elapsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
state.device_state.device.poll(wgpu::Maintain::Wait);
|
||||||
|
}
|
||||||
|
|
||||||
|
duration
|
||||||
|
})
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
group.finish();
|
||||||
|
|
||||||
|
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
|
||||||
|
let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
|
||||||
|
group.throughput(Throughput::Elements(DISPATCH_COUNT as _));
|
||||||
|
|
||||||
|
for threads in [2, 4, 8] {
|
||||||
|
let dispatch_per_pass = DISPATCH_COUNT / threads;
|
||||||
|
group.bench_function(
|
||||||
|
&format!("{threads} threads x {dispatch_per_pass} dispatch"),
|
||||||
|
|b| {
|
||||||
|
Lazy::force(&state);
|
||||||
|
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
profiling::scope!("benchmark invocation");
|
||||||
|
|
||||||
|
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
|
||||||
|
if state.device_state.adapter_info.name.contains("Paravirtual") {
|
||||||
|
return Duration::from_secs_f32(1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut duration = Duration::ZERO;
|
||||||
|
|
||||||
|
for _ in 0..iters {
|
||||||
|
profiling::scope!("benchmark iteration");
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let buffers = (0..threads)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| state.run_subpass(i, threads))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
duration += start.elapsed();
|
||||||
|
|
||||||
|
state.device_state.queue.submit(buffers);
|
||||||
|
state.device_state.device.poll(wgpu::Maintain::Wait);
|
||||||
|
}
|
||||||
|
|
||||||
|
duration
|
||||||
|
})
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
group.finish();
|
||||||
|
|
||||||
|
// Test 10k dispatch calls split up over 1, 2, 4, and 8 threads.
|
||||||
|
let mut group = ctx.benchmark_group("Computepass: Bindless");
|
||||||
|
group.throughput(Throughput::Elements(DISPATCH_COUNT_BINDLESS as _));
|
||||||
|
|
||||||
|
group.bench_function(&format!("{DISPATCH_COUNT_BINDLESS} dispatch"), |b| {
|
||||||
|
Lazy::force(&state);
|
||||||
|
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
profiling::scope!("benchmark invocation");
|
||||||
|
|
||||||
|
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
|
||||||
|
if state.device_state.adapter_info.name.contains("Paravirtual") {
|
||||||
|
return Duration::from_secs_f32(1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Need bindless to run this benchmark
|
||||||
|
if state.bindless_bind_group.is_none() {
|
||||||
|
return Duration::from_secs_f32(1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut duration = Duration::ZERO;
|
||||||
|
|
||||||
|
for _ in 0..iters {
|
||||||
|
profiling::scope!("benchmark iteration");
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let buffer = state.run_bindless_pass();
|
||||||
|
|
||||||
|
duration += start.elapsed();
|
||||||
|
|
||||||
|
state.device_state.queue.submit([buffer]);
|
||||||
|
state.device_state.device.poll(wgpu::Maintain::Wait);
|
||||||
|
}
|
||||||
|
|
||||||
|
duration
|
||||||
|
})
|
||||||
|
});
|
||||||
|
group.finish();
|
||||||
|
|
||||||
|
ctx.bench_function(
|
||||||
|
&format!(
|
||||||
|
"Computepass: Empty Submit with {} Resources",
|
||||||
|
TEXTURE_COUNT + STORAGE_TEXTURE_COUNT + STORAGE_BUFFER_COUNT
|
||||||
|
),
|
||||||
|
|b| {
|
||||||
|
Lazy::force(&state);
|
||||||
|
|
||||||
|
b.iter(|| state.device_state.queue.submit([]));
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group! {
|
||||||
|
name = computepass;
|
||||||
|
config = Criterion::default().measurement_time(Duration::from_secs(10));
|
||||||
|
targets = run_bench,
|
||||||
|
}
|
26
benches/benches/computepass.wgsl
Normal file
26
benches/benches/computepass.wgsl
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
@group(0) @binding(0)
|
||||||
|
var tex_0: texture_2d<f32>;
|
||||||
|
|
||||||
|
@group(0) @binding(1)
|
||||||
|
var tex_1: texture_2d<f32>;
|
||||||
|
|
||||||
|
@group(0) @binding(2)
|
||||||
|
var image_0: texture_storage_2d<r32float, read_write>;
|
||||||
|
|
||||||
|
@group(0) @binding(3)
|
||||||
|
var image_1: texture_storage_2d<r32float, read_write>;
|
||||||
|
|
||||||
|
@group(0) @binding(4)
|
||||||
|
var<storage, read_write> buffer0 : array<vec4f>;
|
||||||
|
|
||||||
|
@group(0) @binding(5)
|
||||||
|
var<storage, read_write> buffer1 : array<vec4f>;
|
||||||
|
|
||||||
|
@compute
|
||||||
|
@workgroup_size(16)
|
||||||
|
fn cs_main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
|
||||||
|
let tex = textureLoad(tex_0, vec2u(0), 0) + textureLoad(tex_1, vec2u(0), 0);
|
||||||
|
let image = textureLoad(image_0, vec2u(0)) + textureLoad(image_1, vec2u(0));
|
||||||
|
buffer0[0] = tex.rrrr;
|
||||||
|
buffer1[0] = image.rrrr;
|
||||||
|
}
|
@ -10,7 +10,12 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
|||||||
|
|
||||||
use crate::DeviceState;
|
use crate::DeviceState;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
const DRAW_COUNT: usize = 8; // Running with up to 8 threads.
|
||||||
|
|
||||||
|
#[cfg(not(test))]
|
||||||
const DRAW_COUNT: usize = 10_000;
|
const DRAW_COUNT: usize = 10_000;
|
||||||
|
|
||||||
// Must match the number of textures in the renderpass.wgsl shader
|
// Must match the number of textures in the renderpass.wgsl shader
|
||||||
const TEXTURES_PER_DRAW: usize = 7;
|
const TEXTURES_PER_DRAW: usize = 7;
|
||||||
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
|
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use criterion::criterion_main;
|
use criterion::criterion_main;
|
||||||
use pollster::block_on;
|
use pollster::block_on;
|
||||||
|
|
||||||
|
mod computepass;
|
||||||
mod renderpass;
|
mod renderpass;
|
||||||
mod resource_creation;
|
mod resource_creation;
|
||||||
mod shader;
|
mod shader;
|
||||||
@ -45,7 +46,7 @@ impl DeviceState {
|
|||||||
required_features: adapter.features(),
|
required_features: adapter.features(),
|
||||||
required_limits: adapter.limits(),
|
required_limits: adapter.limits(),
|
||||||
memory_hints: wgpu::MemoryHints::Performance,
|
memory_hints: wgpu::MemoryHints::Performance,
|
||||||
label: Some("RenderPass Device"),
|
label: Some("Compute/RenderPass Device"),
|
||||||
},
|
},
|
||||||
None,
|
None,
|
||||||
))
|
))
|
||||||
@ -61,6 +62,7 @@ impl DeviceState {
|
|||||||
|
|
||||||
criterion_main!(
|
criterion_main!(
|
||||||
renderpass::renderpass,
|
renderpass::renderpass,
|
||||||
|
computepass::computepass,
|
||||||
resource_creation::resource_creation,
|
resource_creation::resource_creation,
|
||||||
shader::shader
|
shader::shader
|
||||||
);
|
);
|
||||||
|
@ -66,7 +66,7 @@ pub enum CreateBindGroupLayoutError {
|
|||||||
},
|
},
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
TooManyBindings(BindingTypeMaxCountError),
|
TooManyBindings(BindingTypeMaxCountError),
|
||||||
#[error("Binding index {binding} is greater than the maximum index {maximum}")]
|
#[error("Binding index {binding} is greater than the maximum number {maximum}")]
|
||||||
InvalidBindingIndex { binding: u32, maximum: u32 },
|
InvalidBindingIndex { binding: u32, maximum: u32 },
|
||||||
#[error("Invalid visibility {0:?}")]
|
#[error("Invalid visibility {0:?}")]
|
||||||
InvalidVisibility(wgt::ShaderStages),
|
InvalidVisibility(wgt::ShaderStages),
|
||||||
|
Loading…
Reference in New Issue
Block a user