diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index edf8501c5..70a83b51d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,7 @@ env: RUSTDOCFLAGS: -D warnings WASM_BINDGEN_TEST_TIMEOUT: 300 # 5 minutes CACHE_SUFFIX: c # cache busting + WGPU_TESTING: true # We distinguish the following kinds of builds: # - native: build for the same target as we compile on diff --git a/benches/benches/computepass.rs b/benches/benches/computepass.rs index 9a69eb46e..2af141360 100644 --- a/benches/benches/computepass.rs +++ b/benches/benches/computepass.rs @@ -10,24 +10,36 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use crate::DeviceState; -const DISPATCH_COUNT: usize = 10_000; +fn dispatch_count() -> usize { + // On CI we only want to run a very lightweight version of the benchmark + // to ensure that it does not break. + if std::env::var("WGPU_TESTING").is_ok() { + 8 + } else { + 10_000 + } +} // Currently bindless is _much_ slower than with regularly resources, // since wgpu needs to issues barriers for all resources between each dispatch for all read/write textures & buffers. // This is in fact so slow that it makes the benchmark unusable when we use the same amount of // resources as the regular benchmark. // For details see https://github.com/gfx-rs/wgpu/issues/5766 -const DISPATCH_COUNT_BINDLESS: usize = 1_000; +fn dispatch_count_bindless() -> usize { + // On CI we only want to run a very lightweight version of the benchmark + // to ensure that it does not break. + if std::env::var("WGPU_TESTING").is_ok() { + 8 + } else { + 1_000 + } +} // Must match the number of textures in the computepass.wgsl shader const TEXTURES_PER_DISPATCH: usize = 2; const STORAGE_TEXTURES_PER_DISPATCH: usize = 2; const STORAGE_BUFFERS_PER_DISPATCH: usize = 2; -const TEXTURE_COUNT: usize = DISPATCH_COUNT * TEXTURES_PER_DISPATCH; -const STORAGE_TEXTURE_COUNT: usize = DISPATCH_COUNT * STORAGE_TEXTURES_PER_DISPATCH; -const STORAGE_BUFFER_COUNT: usize = DISPATCH_COUNT * STORAGE_BUFFERS_PER_DISPATCH; - const BUFFER_SIZE: u64 = 16; struct ComputepassState { @@ -45,6 +57,12 @@ impl ComputepassState { fn new() -> Self { let device_state = DeviceState::new(); + let dispatch_count = dispatch_count(); + let dispatch_count_bindless = dispatch_count_bindless(); + let texture_count = dispatch_count * TEXTURES_PER_DISPATCH; + let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH; + let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH; + let supports_bindless = device_state.device.features().contains( wgpu::Features::BUFFER_BINDING_ARRAY | wgpu::Features::TEXTURE_BINDING_ARRAY @@ -106,8 +124,8 @@ impl ComputepassState { entries: &bind_group_layout_entries, }); - let mut texture_views = Vec::with_capacity(TEXTURE_COUNT); - for i in 0..TEXTURE_COUNT { + let mut texture_views = Vec::with_capacity(texture_count); + for i in 0..texture_count { let texture = device_state .device .create_texture(&wgpu::TextureDescriptor { @@ -132,8 +150,8 @@ impl ComputepassState { random.shuffle(&mut texture_views); let texture_view_refs: Vec<_> = texture_views.iter().collect(); - let mut storage_texture_views = Vec::with_capacity(STORAGE_TEXTURE_COUNT); - for i in 0..TEXTURE_COUNT { + let mut storage_texture_views = Vec::with_capacity(storage_texture_count); + for i in 0..storage_texture_count { let texture = device_state .device .create_texture(&wgpu::TextureDescriptor { @@ -158,8 +176,8 @@ impl ComputepassState { random.shuffle(&mut storage_texture_views); let storage_texture_view_refs: Vec<_> = storage_texture_views.iter().collect(); - let mut storage_buffers = Vec::with_capacity(STORAGE_BUFFER_COUNT); - for i in 0..STORAGE_BUFFER_COUNT { + let mut storage_buffers = Vec::with_capacity(storage_buffer_count); + for i in 0..storage_buffer_count { storage_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor { label: Some(&format!("Buffer {i}")), size: BUFFER_SIZE, @@ -173,8 +191,8 @@ impl ComputepassState { .map(|b| b.as_entire_buffer_binding()) .collect(); - let mut bind_groups = Vec::with_capacity(DISPATCH_COUNT); - for dispatch_idx in 0..DISPATCH_COUNT { + let mut bind_groups = Vec::with_capacity(dispatch_count); + for dispatch_idx in 0..dispatch_count { let mut entries = Vec::with_capacity(TEXTURES_PER_DISPATCH); for tex_idx in 0..TEXTURES_PER_DISPATCH { entries.push(wgpu::BindGroupEntry { @@ -258,7 +276,7 @@ impl ComputepassState { view_dimension: wgpu::TextureViewDimension::D2, multisampled: false, }, - count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()), + count: Some(NonZeroU32::new(texture_count as u32).unwrap()), }, wgpu::BindGroupLayoutEntry { binding: 1, @@ -268,7 +286,7 @@ impl ComputepassState { format: wgpu::TextureFormat::R32Float, view_dimension: wgpu::TextureViewDimension::D2, }, - count: Some(NonZeroU32::new(STORAGE_TEXTURE_COUNT as u32).unwrap()), + count: Some(NonZeroU32::new(storage_texture_count as u32).unwrap()), }, wgpu::BindGroupLayoutEntry { binding: 2, @@ -278,7 +296,7 @@ impl ComputepassState { has_dynamic_offset: false, min_binding_size: std::num::NonZeroU64::new(BUFFER_SIZE), }, - count: Some(NonZeroU32::new(STORAGE_BUFFER_COUNT as u32).unwrap()), + count: Some(NonZeroU32::new(storage_buffer_count as u32).unwrap()), }, ], }); @@ -293,19 +311,19 @@ impl ComputepassState { wgpu::BindGroupEntry { binding: 0, resource: wgpu::BindingResource::TextureViewArray( - &texture_view_refs[..DISPATCH_COUNT_BINDLESS], + &texture_view_refs[..dispatch_count_bindless], ), }, wgpu::BindGroupEntry { binding: 1, resource: wgpu::BindingResource::TextureViewArray( - &storage_texture_view_refs[..DISPATCH_COUNT_BINDLESS], + &storage_texture_view_refs[..dispatch_count_bindless], ), }, wgpu::BindGroupEntry { binding: 2, resource: wgpu::BindingResource::BufferArray( - &storage_buffer_bindings[..DISPATCH_COUNT_BINDLESS], + &storage_buffer_bindings[..dispatch_count_bindless], ), }, ], @@ -354,7 +372,8 @@ impl ComputepassState { fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer { profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}")); - let dispatch_per_pass = DISPATCH_COUNT / total_passes; + let dispatch_count = dispatch_count(); + let dispatch_per_pass = dispatch_count / total_passes; let mut encoder = self .device_state @@ -379,7 +398,7 @@ impl ComputepassState { encoder.finish() } - fn run_bindless_pass(&self) -> wgpu::CommandBuffer { + fn run_bindless_pass(&self, dispatch_count_bindless: usize) -> wgpu::CommandBuffer { profiling::scope!("Bindless Computepass"); let mut encoder = self @@ -394,7 +413,7 @@ impl ComputepassState { compute_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap()); compute_pass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]); - for _ in 0..DISPATCH_COUNT_BINDLESS { + for _ in 0..dispatch_count_bindless { compute_pass.dispatch_workgroups(1, 1, 1); } @@ -407,13 +426,19 @@ impl ComputepassState { fn run_bench(ctx: &mut Criterion) { let state = Lazy::new(ComputepassState::new); + let dispatch_count = dispatch_count(); + let dispatch_count_bindless = dispatch_count_bindless(); + let texture_count = dispatch_count * TEXTURES_PER_DISPATCH; + let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH; + let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH; + // Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses let mut group = ctx.benchmark_group("Computepass: Single Threaded"); - group.throughput(Throughput::Elements(DISPATCH_COUNT as _)); + group.throughput(Throughput::Elements(dispatch_count as _)); for time_submit in [false, true] { for cpasses in [1, 2, 4, 8] { - let dispatch_per_pass = DISPATCH_COUNT / cpasses; + let dispatch_per_pass = dispatch_count / cpasses; let label = if time_submit { "Submit Time" @@ -466,10 +491,10 @@ fn run_bench(ctx: &mut Criterion) { // Test 10k dispatch calls split up over 2, 4, and 8 threads. let mut group = ctx.benchmark_group("Computepass: Multi Threaded"); - group.throughput(Throughput::Elements(DISPATCH_COUNT as _)); + group.throughput(Throughput::Elements(dispatch_count as _)); for threads in [2, 4, 8] { - let dispatch_per_pass = DISPATCH_COUNT / threads; + let dispatch_per_pass = dispatch_count / threads; group.bench_function( &format!("{threads} threads x {dispatch_per_pass} dispatch"), |b| { @@ -510,9 +535,9 @@ fn run_bench(ctx: &mut Criterion) { // Test 10k dispatch calls split up over 1, 2, 4, and 8 threads. let mut group = ctx.benchmark_group("Computepass: Bindless"); - group.throughput(Throughput::Elements(DISPATCH_COUNT_BINDLESS as _)); + group.throughput(Throughput::Elements(dispatch_count_bindless as _)); - group.bench_function(&format!("{DISPATCH_COUNT_BINDLESS} dispatch"), |b| { + group.bench_function(&format!("{dispatch_count_bindless} dispatch"), |b| { Lazy::force(&state); b.iter_custom(|iters| { @@ -535,7 +560,7 @@ fn run_bench(ctx: &mut Criterion) { let start = Instant::now(); - let buffer = state.run_bindless_pass(); + let buffer = state.run_bindless_pass(dispatch_count_bindless); duration += start.elapsed(); @@ -551,7 +576,7 @@ fn run_bench(ctx: &mut Criterion) { ctx.bench_function( &format!( "Computepass: Empty Submit with {} Resources", - TEXTURE_COUNT + STORAGE_TEXTURE_COUNT + STORAGE_BUFFER_COUNT + texture_count + storage_texture_count + storage_buffer_count ), |b| { Lazy::force(&state); diff --git a/benches/benches/renderpass.rs b/benches/benches/renderpass.rs index f31fc0758..7f2e14116 100644 --- a/benches/benches/renderpass.rs +++ b/benches/benches/renderpass.rs @@ -10,14 +10,19 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use crate::DeviceState; -const DRAW_COUNT: usize = 10_000; +fn draw_count() -> usize { + // On CI we only want to run a very lightweight version of the benchmark + // to ensure that it does not break. + if std::env::var("WGPU_TESTING").is_ok() { + 8 + } else { + 10_000 + } +} // Must match the number of textures in the renderpass.wgsl shader const TEXTURES_PER_DRAW: usize = 7; const VERTEX_BUFFERS_PER_DRAW: usize = 2; -const VERTEX_BUFFER_COUNT: usize = DRAW_COUNT * VERTEX_BUFFERS_PER_DRAW; - -const TEXTURE_COUNT: usize = DRAW_COUNT * TEXTURES_PER_DRAW; struct RenderpassState { device_state: DeviceState, @@ -37,6 +42,10 @@ impl RenderpassState { fn new() -> Self { let device_state = DeviceState::new(); + let draw_count = draw_count(); + let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW; + let texture_count = draw_count * TEXTURES_PER_DRAW; + let supports_bindless = device_state.device.features().contains( wgpu::Features::TEXTURE_BINDING_ARRAY | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING, @@ -44,7 +53,7 @@ impl RenderpassState { .device .limits() .max_sampled_textures_per_shader_stage - >= TEXTURE_COUNT as _; + >= texture_count as _; // Performance gets considerably worse if the resources are shuffled. // @@ -74,8 +83,8 @@ impl RenderpassState { entries: &bind_group_layout_entries, }); - let mut texture_views = Vec::with_capacity(TEXTURE_COUNT); - for i in 0..TEXTURE_COUNT { + let mut texture_views = Vec::with_capacity(texture_count); + for i in 0..texture_count { let texture = device_state .device .create_texture(&wgpu::TextureDescriptor { @@ -101,8 +110,8 @@ impl RenderpassState { let texture_view_refs: Vec<_> = texture_views.iter().collect(); - let mut bind_groups = Vec::with_capacity(DRAW_COUNT); - for draw_idx in 0..DRAW_COUNT { + let mut bind_groups = Vec::with_capacity(draw_count); + for draw_idx in 0..draw_count { let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW); for tex_idx in 0..TEXTURES_PER_DRAW { entries.push(wgpu::BindGroupEntry { @@ -138,8 +147,8 @@ impl RenderpassState { push_constant_ranges: &[], }); - let mut vertex_buffers = Vec::with_capacity(VERTEX_BUFFER_COUNT); - for _ in 0..VERTEX_BUFFER_COUNT { + let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count); + for _ in 0..vertex_buffer_count { vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor { label: None, size: 3 * 16, @@ -149,8 +158,8 @@ impl RenderpassState { } random.shuffle(&mut vertex_buffers); - let mut index_buffers = Vec::with_capacity(DRAW_COUNT); - for _ in 0..DRAW_COUNT { + let mut index_buffers = Vec::with_capacity(draw_count); + for _ in 0..draw_count { index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor { label: None, size: 3 * 4, @@ -246,7 +255,7 @@ impl RenderpassState { view_dimension: wgpu::TextureViewDimension::D2, multisampled: false, }, - count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()), + count: Some(NonZeroU32::new(texture_count as u32).unwrap()), }], }); @@ -324,10 +333,15 @@ impl RenderpassState { } } - fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer { + fn run_subpass( + &self, + pass_number: usize, + total_passes: usize, + draw_count: usize, + ) -> wgpu::CommandBuffer { profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}")); - let draws_per_pass = DRAW_COUNT / total_passes; + let draws_per_pass = draw_count / total_passes; let mut encoder = self .device_state @@ -372,7 +386,7 @@ impl RenderpassState { encoder.finish() } - fn run_bindless_pass(&self) -> wgpu::CommandBuffer { + fn run_bindless_pass(&self, draw_count: usize) -> wgpu::CommandBuffer { profiling::scope!("Bindless Renderpass"); let mut encoder = self @@ -402,7 +416,7 @@ impl RenderpassState { } render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32); - for draw_idx in 0..DRAW_COUNT { + for draw_idx in 0..draw_count { render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1); } @@ -415,13 +429,17 @@ impl RenderpassState { fn run_bench(ctx: &mut Criterion) { let state = Lazy::new(RenderpassState::new); + let draw_count = draw_count(); + let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW; + let texture_count = draw_count * TEXTURES_PER_DRAW; + // Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses let mut group = ctx.benchmark_group("Renderpass: Single Threaded"); - group.throughput(Throughput::Elements(DRAW_COUNT as _)); + group.throughput(Throughput::Elements(draw_count as _)); for time_submit in [false, true] { for rpasses in [1, 2, 4, 8] { - let draws_per_pass = DRAW_COUNT / rpasses; + let draws_per_pass = draw_count / rpasses; let label = if time_submit { "Submit Time" @@ -451,7 +469,7 @@ fn run_bench(ctx: &mut Criterion) { let mut buffers: Vec = Vec::with_capacity(rpasses); for i in 0..rpasses { - buffers.push(state.run_subpass(i, rpasses)); + buffers.push(state.run_subpass(i, rpasses, draw_count)); } if time_submit { @@ -479,10 +497,10 @@ fn run_bench(ctx: &mut Criterion) { // Test 10k draw calls split up over 2, 4, and 8 threads. let mut group = ctx.benchmark_group("Renderpass: Multi Threaded"); - group.throughput(Throughput::Elements(DRAW_COUNT as _)); + group.throughput(Throughput::Elements(draw_count as _)); for threads in [2, 4, 8] { - let draws_per_pass = DRAW_COUNT / threads; + let draws_per_pass = draw_count / threads; group.bench_function( &format!("{threads} threads x {draws_per_pass} draws"), |b| { @@ -505,7 +523,7 @@ fn run_bench(ctx: &mut Criterion) { let buffers = (0..threads) .into_par_iter() - .map(|i| state.run_subpass(i, threads)) + .map(|i| state.run_subpass(i, threads, draw_count)) .collect::>(); duration += start.elapsed(); @@ -523,9 +541,9 @@ fn run_bench(ctx: &mut Criterion) { // Test 10k draw calls split up over 1, 2, 4, and 8 threads. let mut group = ctx.benchmark_group("Renderpass: Bindless"); - group.throughput(Throughput::Elements(DRAW_COUNT as _)); + group.throughput(Throughput::Elements(draw_count as _)); - group.bench_function(&format!("{DRAW_COUNT} draws"), |b| { + group.bench_function(&format!("{draw_count} draws"), |b| { Lazy::force(&state); b.iter_custom(|iters| { @@ -543,7 +561,7 @@ fn run_bench(ctx: &mut Criterion) { let start = Instant::now(); - let buffer = state.run_bindless_pass(); + let buffer = state.run_bindless_pass(draw_count); duration += start.elapsed(); @@ -559,7 +577,7 @@ fn run_bench(ctx: &mut Criterion) { ctx.bench_function( &format!( "Renderpass: Empty Submit with {} Resources", - TEXTURE_COUNT + VERTEX_BUFFER_COUNT + texture_count + vertex_buffer_count ), |b| { Lazy::force(&state);