Inner Daemons ad0f3111b7
Add multiview limits and tests (#8206)
Co-authored-by: Andreas Reich <r_andreas2@web.de>
Co-authored-by: Magnus <85136135+SupaMaggie70Incorporated@users.noreply.github.com>
2025-11-01 12:08:15 +00:00

616 lines
22 KiB
Rust

use std::{
num::NonZeroU32,
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use crate::{is_test, DeviceState};
/// Number of draw calls each benchmark issues.
///
/// Under `is_test()` this is deliberately tiny so CI only smoke-tests the
/// benchmark; a real run uses the full 10k-draw workload.
fn draw_count() -> usize {
    match is_test() {
        // Lightweight path: just verify the benchmark doesn't break.
        true => 8,
        // Full benchmark workload.
        false => 10_000,
    }
}
/// Pass/thread counts to sweep over.
///
/// Tests use a single small configuration; real runs sweep 1 through 8.
fn thread_count_list() -> &'static [usize] {
    const TEST_COUNTS: &[usize] = &[2];
    const FULL_COUNTS: &[usize] = &[1, 2, 4, 8];
    if is_test() {
        TEST_COUNTS
    } else {
        FULL_COUNTS
    }
}
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
// Vertex buffers bound per draw — presumably must also match the vertex
// inputs declared in renderpass.wgsl; verify against the shader if changed.
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
/// All GPU resources needed to record the renderpass benchmark workloads.
struct RenderpassState {
    device_state: DeviceState,
    // Pipeline used by the classic path (one bind group re-bound per draw).
    pipeline: wgpu::RenderPipeline,
    // One bind group per draw, each referencing TEXTURES_PER_DRAW textures.
    bind_groups: Vec<wgpu::BindGroup>,
    // VERTEX_BUFFERS_PER_DRAW buffers per draw.
    vertex_buffers: Vec<wgpu::Buffer>,
    // One index buffer per draw.
    index_buffers: Vec<wgpu::Buffer>,
    // 1x1 color attachment every pass renders into.
    render_target: wgpu::TextureView,
    // Bindless resources — None when the adapter lacks the required
    // features/limits (see `supports_bindless` in `new`).
    bindless_bind_group: Option<wgpu::BindGroup>,
    bindless_pipeline: Option<wgpu::RenderPipeline>,
}
impl RenderpassState {
    /// Create and prepare all the resources needed for the renderpass benchmark.
    fn new() -> Self {
        let device_state = DeviceState::new();
        let draw_count = draw_count();
        let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
        let texture_count = draw_count * TEXTURES_PER_DRAW;

        // Bindless needs texture arrays with non-uniform indexing, plus enough
        // per-stage sampled-texture headroom to bind every texture at once.
        let supports_bindless = device_state.device.features().contains(
            wgpu::Features::TEXTURE_BINDING_ARRAY
                | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
        ) && device_state
            .device
            .limits()
            .max_sampled_textures_per_shader_stage
            >= texture_count as _;

        // Performance gets considerably worse if the resources are shuffled.
        //
        // This more closely matches the real-world use case where resources have no
        // well defined usage order.
        let mut random = WyRand::new_seed(0x8BADF00D);

        // Layout: TEXTURES_PER_DRAW fragment-visible 2D float textures,
        // bindings 0..TEXTURES_PER_DRAW.
        let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW);
        for i in 0..TEXTURES_PER_DRAW {
            bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
                binding: i as u32,
                visibility: wgpu::ShaderStages::FRAGMENT,
                ty: wgpu::BindingType::Texture {
                    sample_type: wgpu::TextureSampleType::Float { filterable: true },
                    view_dimension: wgpu::TextureViewDimension::D2,
                    multisampled: false,
                },
                count: None,
            });
        }
        let bind_group_layout =
            device_state
                .device
                .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
                    label: None,
                    entries: &bind_group_layout_entries,
                });

        // 1x1 placeholder textures — contents never matter, only binding cost.
        let mut texture_views = Vec::with_capacity(texture_count);
        for i in 0..texture_count {
            let texture = device_state
                .device
                .create_texture(&wgpu::TextureDescriptor {
                    label: Some(&format!("Texture {i}")),
                    size: wgpu::Extent3d {
                        width: 1,
                        height: 1,
                        depth_or_array_layers: 1,
                    },
                    mip_level_count: 1,
                    sample_count: 1,
                    dimension: wgpu::TextureDimension::D2,
                    format: wgpu::TextureFormat::Rgba8UnormSrgb,
                    usage: wgpu::TextureUsages::TEXTURE_BINDING,
                    view_formats: &[],
                });
            texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
                label: Some(&format!("Texture View {i}")),
                ..Default::default()
            }));
        }
        random.shuffle(&mut texture_views);

        // Borrowed view list for the bindless TextureViewArray binding below.
        let texture_view_refs: Vec<_> = texture_views.iter().collect();

        // One bind group per draw, each taking a consecutive slice of
        // TEXTURES_PER_DRAW views (consecutive in the already-shuffled list).
        let mut bind_groups = Vec::with_capacity(draw_count);
        for draw_idx in 0..draw_count {
            let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
            for tex_idx in 0..TEXTURES_PER_DRAW {
                entries.push(wgpu::BindGroupEntry {
                    binding: tex_idx as u32,
                    resource: wgpu::BindingResource::TextureView(
                        &texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx],
                    ),
                });
            }
            bind_groups.push(
                device_state
                    .device
                    .create_bind_group(&wgpu::BindGroupDescriptor {
                        label: None,
                        layout: &bind_group_layout,
                        entries: &entries,
                    }),
            );
        }
        random.shuffle(&mut bind_groups);

        let sm = device_state
            .device
            .create_shader_module(wgpu::include_wgsl!("renderpass.wgsl"));
        let pipeline_layout =
            device_state
                .device
                .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                    label: None,
                    bind_group_layouts: &[&bind_group_layout],
                    push_constant_ranges: &[],
                });

        // Tiny vertex buffers: 3 vertices x 16 bytes (Float32x4) each.
        let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count);
        for _ in 0..vertex_buffer_count {
            vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
                label: None,
                size: 3 * 16,
                usage: wgpu::BufferUsages::VERTEX,
                mapped_at_creation: false,
            }));
        }
        random.shuffle(&mut vertex_buffers);

        // Tiny index buffers: 3 x u32 indices each.
        let mut index_buffers = Vec::with_capacity(draw_count);
        for _ in 0..draw_count {
            index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
                label: None,
                size: 3 * 4,
                usage: wgpu::BufferUsages::INDEX,
                mapped_at_creation: false,
            }));
        }
        random.shuffle(&mut index_buffers);

        // One Float32x4 attribute per vertex buffer slot, shader location = slot.
        let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
        for i in 0..VERTEX_BUFFERS_PER_DRAW {
            vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]);
        }
        let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
        for attributes in &vertex_buffer_attributes {
            vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
                array_stride: 16,
                step_mode: wgpu::VertexStepMode::Vertex,
                attributes,
            });
        }

        let pipeline =
            device_state
                .device
                .create_render_pipeline(&wgpu::RenderPipelineDescriptor {
                    label: None,
                    layout: Some(&pipeline_layout),
                    vertex: wgpu::VertexState {
                        module: &sm,
                        entry_point: Some("vs_main"),
                        buffers: &vertex_buffer_layouts,
                        compilation_options: wgpu::PipelineCompilationOptions::default(),
                    },
                    primitive: wgpu::PrimitiveState {
                        topology: wgpu::PrimitiveTopology::TriangleList,
                        strip_index_format: None,
                        front_face: wgpu::FrontFace::Cw,
                        cull_mode: Some(wgpu::Face::Back),
                        polygon_mode: wgpu::PolygonMode::Fill,
                        unclipped_depth: false,
                        conservative: false,
                    },
                    depth_stencil: None,
                    multisample: wgpu::MultisampleState::default(),
                    fragment: Some(wgpu::FragmentState {
                        module: &sm,
                        entry_point: Some("fs_main"),
                        targets: &[Some(wgpu::ColorTargetState {
                            format: wgpu::TextureFormat::Rgba8UnormSrgb,
                            blend: None,
                            write_mask: wgpu::ColorWrites::ALL,
                        })],
                        compilation_options: wgpu::PipelineCompilationOptions::default(),
                    }),
                    multiview_mask: None,
                    cache: None,
                });

        // 1x1 render target shared by every pass; size is irrelevant since we
        // benchmark command recording/submission, not rasterization.
        let render_target = device_state
            .device
            .create_texture(&wgpu::TextureDescriptor {
                label: Some("Render Target"),
                size: wgpu::Extent3d {
                    width: 1,
                    height: 1,
                    depth_or_array_layers: 1,
                },
                mip_level_count: 1,
                sample_count: 1,
                dimension: wgpu::TextureDimension::D2,
                format: wgpu::TextureFormat::Rgba8UnormSrgb,
                usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
                view_formats: &[],
            })
            .create_view(&wgpu::TextureViewDescriptor::default());

        // Optional bindless variants: a single texture array holding every
        // texture, indexed in the shader, so binding happens once per pass.
        let mut bindless_bind_group = None;
        let mut bindless_pipeline = None;
        if supports_bindless {
            let bindless_bind_group_layout =
                device_state
                    .device
                    .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
                        label: None,
                        entries: &[wgpu::BindGroupLayoutEntry {
                            binding: 0,
                            visibility: wgpu::ShaderStages::FRAGMENT,
                            ty: wgpu::BindingType::Texture {
                                sample_type: wgpu::TextureSampleType::Float { filterable: true },
                                view_dimension: wgpu::TextureViewDimension::D2,
                                multisampled: false,
                            },
                            // texture_count > 0 for all draw_count values, so
                            // the NonZeroU32 unwrap cannot fail.
                            count: Some(NonZeroU32::new(texture_count as u32).unwrap()),
                        }],
                    });
            bindless_bind_group = Some(device_state.device.create_bind_group(
                &wgpu::BindGroupDescriptor {
                    label: None,
                    layout: &bindless_bind_group_layout,
                    entries: &[wgpu::BindGroupEntry {
                        binding: 0,
                        resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs),
                    }],
                },
            ));
            let bindless_shader_module = device_state
                .device
                .create_shader_module(wgpu::include_wgsl!("renderpass-bindless.wgsl"));
            let bindless_pipeline_layout =
                device_state
                    .device
                    .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
                        label: None,
                        bind_group_layouts: &[&bindless_bind_group_layout],
                        push_constant_ranges: &[],
                    });
            bindless_pipeline = Some(device_state.device.create_render_pipeline(
                &wgpu::RenderPipelineDescriptor {
                    label: None,
                    layout: Some(&bindless_pipeline_layout),
                    vertex: wgpu::VertexState {
                        module: &bindless_shader_module,
                        entry_point: Some("vs_main"),
                        buffers: &vertex_buffer_layouts,
                        compilation_options: wgpu::PipelineCompilationOptions::default(),
                    },
                    primitive: wgpu::PrimitiveState {
                        topology: wgpu::PrimitiveTopology::TriangleList,
                        strip_index_format: None,
                        front_face: wgpu::FrontFace::Cw,
                        cull_mode: Some(wgpu::Face::Back),
                        polygon_mode: wgpu::PolygonMode::Fill,
                        unclipped_depth: false,
                        conservative: false,
                    },
                    depth_stencil: None,
                    multisample: wgpu::MultisampleState::default(),
                    fragment: Some(wgpu::FragmentState {
                        module: &bindless_shader_module,
                        entry_point: Some("fs_main"),
                        targets: &[Some(wgpu::ColorTargetState {
                            format: wgpu::TextureFormat::Rgba8UnormSrgb,
                            blend: None,
                            write_mask: wgpu::ColorWrites::ALL,
                        })],
                        compilation_options: wgpu::PipelineCompilationOptions::default(),
                    }),
                    multiview_mask: None,
                    cache: None,
                },
            ));
        }

        Self {
            device_state,
            pipeline,
            bind_groups,
            vertex_buffers,
            index_buffers,
            render_target,
            bindless_bind_group,
            bindless_pipeline,
        }
    }

    /// Record one renderpass containing this pass's share of the draws and
    /// return the finished command buffer.
    ///
    /// `draw_count` is the total across all passes; `pass_number` (0-based)
    /// selects draws `[pass_number * draws_per_pass, (pass_number + 1) * draws_per_pass)`.
    /// Pipeline, bind group, vertex buffers, and index buffer are re-set for
    /// every draw on purpose — per-draw state-change overhead is the thing
    /// being measured.
    fn run_subpass(
        &self,
        pass_number: usize,
        total_passes: usize,
        draw_count: usize,
    ) -> wgpu::CommandBuffer {
        profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));
        // Integer division: any remainder draws are dropped, which is fine for
        // a benchmark (draw_count is a multiple of every total_passes used).
        let draws_per_pass = draw_count / total_passes;
        let mut encoder = self
            .device_state
            .device
            .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
        let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: None,
            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                view: &self.render_target,
                depth_slice: None,
                resolve_target: None,
                ops: wgpu::Operations {
                    load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
                    store: wgpu::StoreOp::Store,
                },
            })],
            occlusion_query_set: None,
            timestamp_writes: None,
            depth_stencil_attachment: None,
            multiview_mask: None,
        });
        let start_idx = pass_number * draws_per_pass;
        let end_idx = start_idx + draws_per_pass;
        for draw_idx in start_idx..end_idx {
            render_pass.set_pipeline(&self.pipeline);
            render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
            for i in 0..VERTEX_BUFFERS_PER_DRAW {
                render_pass.set_vertex_buffer(
                    i as u32,
                    self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..),
                );
            }
            render_pass.set_index_buffer(
                self.index_buffers[draw_idx].slice(..),
                wgpu::IndexFormat::Uint32,
            );
            render_pass.draw_indexed(0..3, 0, 0..1);
        }
        // End the render pass before finishing the encoder.
        drop(render_pass);
        encoder.finish()
    }

    /// Record a single renderpass issuing `draw_count` draws against the
    /// bindless pipeline: state is bound once up front, then only draw calls
    /// are recorded.
    ///
    /// Panics if the bindless resources were not created — callers must check
    /// `bindless_bind_group.is_some()` first (run_bench does).
    fn run_bindless_pass(&self, draw_count: usize) -> wgpu::CommandBuffer {
        profiling::scope!("Bindless Renderpass");
        let mut encoder = self
            .device_state
            .device
            .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
        let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: None,
            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                view: &self.render_target,
                depth_slice: None,
                resolve_target: None,
                ops: wgpu::Operations {
                    load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
                    store: wgpu::StoreOp::Store,
                },
            })],
            occlusion_query_set: None,
            timestamp_writes: None,
            depth_stencil_attachment: None,
            multiview_mask: None,
        });
        render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
        render_pass.set_bind_group(0, Some(self.bindless_bind_group.as_ref().unwrap()), &[]);
        // NOTE(review): every vertex slot is bound to vertex_buffers[0]. Since
        // only per-draw overhead is measured this looks intentional, but
        // confirm the same buffer in all slots (vs. [i]) is the desired setup.
        for i in 0..VERTEX_BUFFERS_PER_DRAW {
            render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..));
        }
        render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);
        for draw_idx in 0..draw_count {
            // first_instance = draw_idx lets the shader pick per-draw data.
            render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1);
        }
        // End the render pass before finishing the encoder.
        drop(render_pass);
        encoder.finish()
    }
}
/// Criterion entry point: registers all renderpass benchmark groups.
///
/// State creation is deferred via `LazyLock` so device setup cost is excluded
/// from measurement and only paid if a benchmark actually runs.
fn run_bench(ctx: &mut Criterion) {
    let state = LazyLock::new(RenderpassState::new);
    let draw_count = draw_count();
    let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
    let texture_count = draw_count * TEXTURES_PER_DRAW;

    // Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
    let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
    group.throughput(Throughput::Elements(draw_count as _));
    // time_submit = false measures command recording; true measures only the
    // queue.submit() call.
    for time_submit in [false, true] {
        for &rpasses in thread_count_list() {
            let draws_per_pass = draw_count / rpasses;
            let label = if time_submit {
                "Submit Time"
            } else {
                "Renderpass Time"
            };
            group.bench_function(
                format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
                |b| {
                    LazyLock::force(&state);
                    b.iter_custom(|iters| {
                        profiling::scope!("benchmark invocation");
                        // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
                        if state.device_state.adapter_info.name.contains("Paravirtual") {
                            return Duration::from_secs(1);
                        }
                        let mut duration = Duration::ZERO;
                        for _ in 0..iters {
                            profiling::scope!("benchmark iteration");
                            let mut start = Instant::now();
                            let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
                            for i in 0..rpasses {
                                buffers.push(state.run_subpass(i, rpasses, draw_count));
                            }
                            // Restart or stop the clock so exactly one of
                            // {recording, submit} is timed.
                            if time_submit {
                                start = Instant::now();
                            } else {
                                duration += start.elapsed();
                            }
                            state.device_state.queue.submit(buffers);
                            if time_submit {
                                duration += start.elapsed();
                            }
                            // Wait for GPU completion outside the timed region.
                            state
                                .device_state
                                .device
                                .poll(wgpu::PollType::wait_indefinitely())
                                .unwrap();
                        }
                        duration
                    })
                },
            );
        }
    }
    group.finish();

    // Test 10k draw calls recorded in parallel on 1, 2, 4, and 8 threads
    // (one renderpass per thread, via rayon).
    let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
    group.throughput(Throughput::Elements(draw_count as _));
    for &threads in thread_count_list() {
        let draws_per_pass = draw_count / threads;
        group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
            LazyLock::force(&state);
            b.iter_custom(|iters| {
                profiling::scope!("benchmark invocation");
                // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
                if state.device_state.adapter_info.name.contains("Paravirtual") {
                    return Duration::from_secs_f32(1.0);
                }
                let mut duration = Duration::ZERO;
                for _ in 0..iters {
                    profiling::scope!("benchmark iteration");
                    let start = Instant::now();
                    let buffers = (0..threads)
                        .into_par_iter()
                        .map(|i| state.run_subpass(i, threads, draw_count))
                        .collect::<Vec<_>>();
                    // Only parallel recording is timed; submit and poll are not.
                    duration += start.elapsed();
                    state.device_state.queue.submit(buffers);
                    state
                        .device_state
                        .device
                        .poll(wgpu::PollType::wait_indefinitely())
                        .unwrap();
                }
                duration
            })
        });
    }
    group.finish();

    // Test all draw calls issued from a single bindless renderpass
    // (state bound once, then draw calls only).
    let mut group = ctx.benchmark_group("Renderpass: Bindless");
    group.throughput(Throughput::Elements(draw_count as _));
    group.bench_function(format!("{draw_count} draws"), |b| {
        LazyLock::force(&state);
        b.iter_custom(|iters| {
            profiling::scope!("benchmark invocation");
            // Need bindless to run this benchmark
            if state.bindless_bind_group.is_none() {
                return Duration::from_secs_f32(1.0);
            }
            let mut duration = Duration::ZERO;
            for _ in 0..iters {
                profiling::scope!("benchmark iteration");
                let start = Instant::now();
                let buffer = state.run_bindless_pass(draw_count);
                // Only recording is timed; submit and poll are not.
                duration += start.elapsed();
                state.device_state.queue.submit([buffer]);
                state
                    .device_state
                    .device
                    .poll(wgpu::PollType::wait_indefinitely())
                    .unwrap();
            }
            duration
        })
    });
    group.finish();

    // Baseline: cost of an empty submit while many resources are alive.
    ctx.bench_function(
        &format!(
            "Renderpass: Empty Submit with {} Resources",
            texture_count + vertex_buffer_count
        ),
        |b| {
            LazyLock::force(&state);
            b.iter(|| state.device_state.queue.submit([]));
        },
    );
}
// Criterion group entry point: runs `run_bench` with a 10-second measurement
// window per benchmark.
criterion_group! {
    name = renderpass;
    config = Criterion::default().measurement_time(Duration::from_secs(10));
    targets = run_bench,
}