Setup and use new benchmarking harness (#8511)

Connor Fitzgerald 2025-11-18 15:28:21 -05:00 committed by GitHub
parent 6043b059c4
commit 853ad6c464
15 changed files with 1313 additions and 926 deletions

Cargo.lock (generated, 122 changed lines)

@ -118,12 +118,6 @@ dependencies = [
"libc",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstream"
version = "0.6.21"
@ -592,12 +586,6 @@ dependencies = [
"thiserror 2.0.17",
]
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.2.46"
@ -652,33 +640,6 @@ dependencies = [
"libc",
]
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
@ -884,39 +845,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "criterion"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"itertools 0.13.0",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
dependencies = [
"cast",
"itertools 0.13.0",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
@ -3022,12 +2950,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "orbclient"
version = "0.3.49"
@ -3185,34 +3107,6 @@ dependencies = [
"winit 0.29.15",
]
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "png"
version = "0.18.0"
@ -4112,16 +4006,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tokio"
version = "1.48.0"
@ -4800,15 +4684,19 @@ dependencies = [
name = "wgpu-benchmark"
version = "27.0.0"
dependencies = [
"anyhow",
"bincode 2.0.1",
"bytemuck",
"criterion",
"naga",
"naga-test",
"nanorand 0.8.0",
"pico-args",
"pollster",
"profiling",
"rayon",
"serde",
"serde_json",
"termcolor",
"tracy-client",
"wgpu",
]


@ -250,7 +250,7 @@ deno_webgpu = { version = "0.181.0", path = "./deno_webgpu" }
deno_unsync = "0.4.4"
deno_error = "0.7.0"
tokio = "1.47"
termcolor = "1.1.3"
termcolor = "1.4.1"
# android dependencies
ndk-sys = "0.6"


@ -16,19 +16,18 @@ name = "wgpu-benchmark"
harness = false
[features]
# Uncomment these features to enable tracy and superluminal profiling.
# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
# superluminal = ["profiling/profile-with-superluminal"]
tracy = ["dep:tracy-client"]
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = [
'cfg(feature, values("tracy"))',
] }
[dependencies]
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
anyhow.workspace = true
bincode = { workspace = true, features = ["serde"] }
bytemuck.workspace = true
criterion.workspace = true
# criterion.workspace = true
naga = { workspace = true, features = [
"deserialize",
"serialize",
@ -43,8 +42,12 @@ naga = { workspace = true, features = [
] }
naga-test = { workspace = true, features = [] }
nanorand.workspace = true
pico-args.workspace = true
pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
termcolor.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true


@ -1,9 +1,6 @@
Collection of CPU benchmarks for `wgpu`.
These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance for users.
They all do very little GPU work and are testing the CPU performance of the API.
Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.
## Usage
@ -14,65 +11,30 @@ cargo bench -p wgpu-benchmark
cargo bench -p wgpu-benchmark -- "filter"
```
## Benchmarks
#### `Renderpass`
This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every draw call.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
#### `Resource Creation`
This benchmark measures the performance of creating large resources. By default it makes buffers that are 256MB. It tests this over a range of thread counts.
#### `Shader Compilation`
This benchmark measures the performance of naga parsing, validating, and generating shaders.
Use `WGPU_BACKEND` and `WGPU_ADAPTER_NAME` to adjust which device the benchmarks use. [More info on env vars](../README.md#environment-variables).
## Comparing Against a Baseline
To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
For example, to compare v0.20 against trunk, you could run the following:
For example, to compare v28 against trunk, you could run the following:
```sh
git checkout v0.20
git checkout v28
# Run the baseline benchmarks
cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
cargo bench -p wgpu-benchmark -- --save-baseline "v28"
git checkout trunk
# Run the current benchmarks
cargo bench -p wgpu-benchmark -- --baseline "v0.20"
cargo bench -p wgpu-benchmark -- --baseline "v28"
```
You can use this for any bits of code you want to compare.
The current benchmarking framework was added shortly before v28, so these comparisons only work between commits made after it landed. For earlier commits the same commands still work, but the comparison is handled by `criterion`.
## Integration with Profilers
The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
you need to uncomment the features in the `Cargo.toml` to allow features to be used.
Integrations are available for `tracy` and `superluminal`.
#### Tracy
@ -80,7 +42,7 @@ Tracy is available prebuilt for Windows on [github](https://github.com/wolfpld/t
```sh
# Once this is running, you can connect to it with the Tracy Profiler
cargo bench -p wgpu-benchmark --features tracy
cargo bench -p wgpu-benchmark --features tracy,profiling/profile-with-tracy
```
#### Superluminal
@ -89,10 +51,10 @@ Superluminal is a paid product for windows available [here](https://superluminal
```sh
# This command will build the benchmarks, and display the path to the executable
cargo bench -p wgpu-benchmark --features superluminal -- -h
cargo bench -p wgpu-benchmark --features profiling/profile-with-superluminal -- -h
# Have Superluminal run the following command (replacing with the path to the executable)
./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
<path_to_exe> --bench "filter"
```
#### `perf` and others
@ -105,6 +67,42 @@ For example, the command line tool `perf` can be used to profile the benchmarks.
cargo bench -p wgpu-benchmark -- -h
# Run the benchmarks with perf
perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
perf record <path_to_exe> --bench "filter"
```
## Benchmarks
#### `Renderpass Encoding`
This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass Encoding`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
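Both pass benchmarks report encoding and submission as separate results. The measurement pattern, simplified from the `iter_many` callbacks in the benchmark code further down in this commit (the `record_passes` closure stands in for their `run_subpass` calls), is roughly:

```rust
use std::time::{Duration, Instant};

fn encode_then_submit(
    queue: &wgpu::Queue,
    record_passes: impl FnOnce() -> Vec<wgpu::CommandBuffer>,
) -> Vec<Duration> {
    // Time command recording on its own...
    let encoding_start = Instant::now();
    let buffers = record_passes();
    let encoding_duration = encoding_start.elapsed();

    // ...then time the submission separately.
    let submit_start = Instant::now();
    queue.submit(buffers);
    let submit_duration = submit_start.elapsed();

    // One duration per reported label: "Encoding (...)" and "Submit (...)".
    vec![encoding_duration, submit_duration]
}
```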
#### `Device::create_buffer`
This benchmark measures the performance of creating large buffers.
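Concretely, each buffer is created roughly like this, mirroring `resource_creation.rs` further down in this commit (the helper name is illustrative):

```rust
fn create_large_buffer(device: &wgpu::Device) -> wgpu::Buffer {
    // 256 MiB, not mapped at creation; created from several threads in parallel.
    device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size: 256 * 1024 * 1024,
        usage: wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    })
}
```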
#### `Device::create_bind_group`
This benchmark measures the performance of creating large bind groups of 5 to 50,000 resources.
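The bind groups bind a single array of sampled textures, so the layout entry looks roughly like this, mirroring `bind_groups.rs` further down in this commit (the helper name is illustrative):

```rust
use std::num::NonZeroU32;

fn texture_array_layout_entry(count: u32) -> wgpu::BindGroupLayoutEntry {
    // One binding that holds `count` sampled textures at once.
    wgpu::BindGroupLayoutEntry {
        binding: 0,
        visibility: wgpu::ShaderStages::FRAGMENT,
        ty: wgpu::BindingType::Texture {
            sample_type: wgpu::TextureSampleType::Float { filterable: true },
            view_dimension: wgpu::TextureViewDimension::D2,
            multisampled: false,
        },
        count: Some(NonZeroU32::new(count).unwrap()),
    }
}
```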
#### `naga::back`, `naga::compact`, `naga::front`, and `naga::valid`
These benchmarks measure the performance of naga parsing, validating, compacting, and generating shaders.
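As a rough sketch of the work those four benchmarks time per shader (API usage mirrors `shader.rs` further down in this commit; the WGSL source is a stand-in):

```rust
fn naga_round_trip() {
    let source = "@compute @workgroup_size(1) fn main() {}";

    // naga::front: parse WGSL into a Module.
    let mut module = naga::front::wgsl::Frontend::new().parse(source).unwrap();

    // naga::compact: strip unused items in place.
    naga::compact::compact(&mut module, naga::compact::KeepUnused::No);

    // naga::valid: validate and produce the ModuleInfo the backends need.
    let info = naga::valid::Validator::new(
        naga::valid::ValidationFlags::all(),
        naga::valid::Capabilities::all(),
    )
    .validate(&module)
    .unwrap();

    // naga::back: write the module back out, here as SPIR-V words.
    let mut words = Vec::new();
    let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
    let _ = writer.write(&module, &info, None, &None, &mut words);
}
```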


@ -1,173 +1,127 @@
use std::{
num::NonZeroU32,
time::{Duration, Instant},
};
use std::{num::NonZeroU32, time::Instant};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};
use crate::{is_test, DeviceState};
use crate::DeviceState;
struct Params {
max_texture_count: u32,
texture_counts: &'static [u32],
}
// Creating 50_000 textures takes a considerable amount of time with syncval enabled.
//
// We greatly reduce the number of textures for the test case to keep the runtime
// reasonable for testing.
const MAX_TEXTURE_COUNT_BENCHMARK: u32 = 50_000;
const TEXTURE_COUNTS_BENCHMARK: &[u32] = &[5, 50, 500, 5_000, 50_000];
const BENCHMARK_PARAMS: Params = Params {
max_texture_count: 50_000,
texture_counts: &[5, 50, 500, 5_000, 50_000],
};
const MAX_TEXTURE_COUNT_TEST: u32 = 5;
const TEXTURE_COUNTS_TEST: &[u32] = &[5];
const TEST_PARAMS: Params = Params {
max_texture_count: 5,
texture_counts: &[5],
};
struct BindGroupState {
device_state: DeviceState,
texture_views: Vec<wgpu::TextureView>,
}
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let device_state = DeviceState::new();
impl BindGroupState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();
let texture_count = if is_test() {
MAX_TEXTURE_COUNT_TEST
} else {
MAX_TEXTURE_COUNT_BENCHMARK
};
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);
Self {
device_state,
texture_views,
}
if !device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
anyhow::bail!("Device does not support required feature TEXTURE_BINDING_ARRAY");
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(BindGroupState::new);
let mut group = ctx.benchmark_group("Bind Group Creation");
let count_list = if is_test() {
TEXTURE_COUNTS_TEST
let params = if ctx.is_test() {
TEST_PARAMS
} else {
TEXTURE_COUNTS_BENCHMARK
BENCHMARK_PARAMS
};
for &count in count_list {
group.throughput(Throughput::Elements(count as u64));
group.bench_with_input(
format!("{count} Element Bind Group"),
&count,
|b, &count| {
b.iter_custom(|iters| {
if !state
.device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
return Duration::ZERO;
}
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
if count
> state
.device_state
.device
.limits()
.max_sampled_textures_per_shader_stage
{
return Duration::ZERO;
}
let bind_group_layout = state.device_state.device.create_bind_group_layout(
&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float {
filterable: true,
},
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
},
);
let texture_view_refs: Vec<_> =
state.texture_views.iter().take(count as usize).collect();
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let bind_group = state.device_state.device.create_bind_group(
&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(
&texture_view_refs,
),
}],
label: None,
},
);
duration += start.elapsed();
drop(bind_group);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
});
},
);
let mut texture_views = Vec::with_capacity(params.max_texture_count as usize);
for i in 0..params.max_texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
}
random.shuffle(&mut texture_views);
criterion_group! {
name = bind_groups;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
let mut results = Vec::new();
for &count in params.texture_counts {
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
});
let texture_view_refs: Vec<_> = texture_views.iter().take(count as usize).collect();
let name = format!("{count} Textures");
let res = iter(&ctx, &name, "bindings", count, || {
let start = Instant::now();
let bind_group = device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs),
}],
label: None,
});
let time = start.elapsed();
drop(bind_group);
device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
time
});
results.push(res);
}
Ok(results)
}


@ -3,17 +3,16 @@ use std::{
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_auto, iter_many, BenchmarkContext, LoopControl};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn dispatch_count() -> usize {
fn dispatch_count(ctx: &BenchmarkContext) -> usize {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
@ -25,18 +24,18 @@ fn dispatch_count() -> usize {
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
fn dispatch_count_bindless() -> usize {
fn dispatch_count_bindless(ctx: &BenchmarkContext) -> usize {
// On CI we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
1_000
}
}
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[2, 4, 8]
@ -62,11 +61,11 @@ struct ComputepassState {
impl ComputepassState {
/// Create and prepare all the resources needed for the computepass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();
let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let dispatch_count = dispatch_count(ctx);
let dispatch_count_bindless = dispatch_count_bindless(ctx);
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
@ -377,10 +376,15 @@ impl ComputepassState {
}
}
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
fn run_subpass(
&self,
ctx: &BenchmarkContext,
pass_number: usize,
total_passes: usize,
) -> wgpu::CommandBuffer {
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
let dispatch_count = dispatch_count();
let dispatch_count = dispatch_count(ctx);
let dispatch_per_pass = dispatch_count / total_passes;
let mut encoder = self
@ -431,183 +435,140 @@ impl ComputepassState {
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(ComputepassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = ComputepassState::new(&ctx);
let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}
let dispatch_count = dispatch_count(&ctx);
let dispatch_count_bindless = dispatch_count_bindless(&ctx);
let mut results = Vec::new();
// Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
let mut group = ctx.benchmark_group("Computepass: Single Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));
for &cpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({cpasses} passes)"),
format!("Submit ({cpasses} passes)"),
];
for time_submit in [false, true] {
for &cpasses in thread_count_list() {
let dispatch_per_pass = dispatch_count / cpasses;
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
let encoding_start = Instant::now();
for i in 0..cpasses {
buffers.push(state.run_subpass(&ctx, i, cpasses));
}
let encoding_duration = encoding_start.elapsed();
let label = if time_submit {
"Submit Time"
} else {
"Computepass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
group.bench_function(
format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let mut start = Instant::now();
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
for i in 0..cpasses {
buffers.push(state.run_subpass(i, cpasses));
}
if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}
state.device_state.queue.submit(buffers);
if time_submit {
duration += start.elapsed();
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
}
group.finish();
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));
for &threads in thread_count_list() {
let dispatch_per_pass = dispatch_count / threads;
group.bench_function(
format!("{threads} threads x {dispatch_per_pass} dispatch"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads))
.collect::<Vec<_>>();
duration += start.elapsed();
state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
group.finish();
// Test 10k dispatch calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Bindless");
group.throughput(Throughput::Elements(dispatch_count_bindless as _));
group.bench_function(format!("{dispatch_count_bindless} dispatch"), |b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs(1);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffer = state.run_bindless_pass(dispatch_count_bindless);
duration += start.elapsed();
state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
group.finish();
vec![encoding_duration, submit_duration]
},
));
}
ctx.bench_function(
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(&ctx, i, threads))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
vec![encoding_duration, submit_duration]
},
));
}
// Test 10k dispatch calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count_bindless as _,
|| {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(dispatch_count_bindless);
let encoding_duration = encoding_start.elapsed();
let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
vec![encoding_duration, submit_duration]
},
));
}
// Test empty submit overhead with all resources
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
results.push(iter_auto(
&ctx,
&format!(
"Computepass: Empty Submit with {} Resources",
"Empty Submit with {} Resources",
texture_count + storage_texture_count + storage_buffer_count
),
|b| {
LazyLock::force(&state);
b.iter(|| state.device_state.queue.submit([]));
"submits",
1,
|| {
state.device_state.queue.submit([]);
},
);
}
));
criterion_group! {
name = computepass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,5 +1,7 @@
use criterion::criterion_main;
#[cfg_attr(target_arch = "wasm32", no_main)]
#[cfg(not(target_arch = "wasm32"))]
use pollster::block_on;
use wgpu_benchmark::Benchmark;
mod bind_groups;
mod computepass;
@ -7,10 +9,6 @@ mod renderpass;
mod resource_creation;
mod shader;
fn is_test() -> bool {
std::env::var("NEXTEST").is_ok()
}
struct DeviceState {
adapter_info: wgpu::AdapterInfo,
device: wgpu::Device,
@ -41,14 +39,17 @@ impl DeviceState {
let adapter_info = adapter.get_info();
eprintln!("{adapter_info:?}");
println!(
" Using adapter: {} ({:?})",
adapter_info.name, adapter_info.backend
);
let (device, queue) = block_on(adapter.request_device(&wgpu::DeviceDescriptor {
required_features: adapter.features(),
required_limits: adapter.limits(),
memory_hints: wgpu::MemoryHints::Performance,
experimental_features: unsafe { wgpu::ExperimentalFeatures::enabled() },
label: Some("Compute/RenderPass Device"),
label: None,
trace: wgpu::Trace::Off,
}))
.unwrap();
@ -61,10 +62,41 @@ impl DeviceState {
}
}
criterion_main!(
bind_groups::bind_groups,
renderpass::renderpass,
computepass::computepass,
resource_creation::resource_creation,
shader::shader
);
fn main() {
let benchmarks = vec![
Benchmark {
name: "Device::create_bind_group",
func: bind_groups::run_bench,
},
Benchmark {
name: "Device::create_buffer",
func: resource_creation::run_bench,
},
Benchmark {
name: "naga::front",
func: shader::frontends,
},
Benchmark {
name: "naga::valid",
func: shader::validation,
},
Benchmark {
name: "naga::compact",
func: shader::compact,
},
Benchmark {
name: "naga::back",
func: shader::backends,
},
Benchmark {
name: "Renderpass Encoding",
func: renderpass::run_bench,
},
Benchmark {
name: "Computepass Encoding",
func: computepass::run_bench,
},
];
wgpu_benchmark::main(benchmarks);
}
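For reference, a registered benchmark is just a function with this shape; a minimal hypothetical example using the harness's `iter_auto` helper (defined in `benches/src/iter.rs` below) would be:

```rust
use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};

// Hypothetical benchmark: times a trivial CPU-side operation.
fn run_example_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
    let data: Vec<u32> = (0..1_000).collect();

    // `iter_auto` times the closure itself; "elements" is the throughput unit
    // and 1_000 the throughput count per iteration.
    let result = iter_auto(&ctx, "sum 1k elements", "elements", 1_000, || {
        let _ = data.iter().sum::<u32>();
    });

    Ok(vec![result])
}

// Registered above as: Benchmark { name: "Example", func: run_example_bench }
```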


@ -3,34 +3,33 @@ use std::{
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_many, BenchmarkContext, LoopControl};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn draw_count() -> usize {
fn draw_count(ctx: &BenchmarkContext) -> u32 {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
}
}
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [u32] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
&[1, 2, 4]
}
}
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
const TEXTURES_PER_DRAW: u32 = 7;
const VERTEX_BUFFERS_PER_DRAW: u32 = 2;
struct RenderpassState {
device_state: DeviceState,
@ -47,10 +46,10 @@ struct RenderpassState {
impl RenderpassState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();
let draw_count = draw_count();
let draw_count = draw_count(ctx);
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;
@ -69,10 +68,10 @@ impl RenderpassState {
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for i in 0..TEXTURES_PER_DRAW {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
binding: i,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
@ -91,7 +90,7 @@ impl RenderpassState {
entries: &bind_group_layout_entries,
});
let mut texture_views = Vec::with_capacity(texture_count);
let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
@ -118,14 +117,14 @@ impl RenderpassState {
let texture_view_refs: Vec<_> = texture_views.iter().collect();
let mut bind_groups = Vec::with_capacity(draw_count);
let mut bind_groups = Vec::with_capacity(draw_count as usize);
for draw_idx in 0..draw_count {
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for tex_idx in 0..TEXTURES_PER_DRAW {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
binding: tex_idx,
resource: wgpu::BindingResource::TextureView(
&texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx],
&texture_views[(draw_idx * TEXTURES_PER_DRAW + tex_idx) as usize],
),
});
}
@ -155,7 +154,7 @@ impl RenderpassState {
push_constant_ranges: &[],
});
let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count);
let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count as usize);
for _ in 0..vertex_buffer_count {
vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@ -166,7 +165,7 @@ impl RenderpassState {
}
random.shuffle(&mut vertex_buffers);
let mut index_buffers = Vec::with_capacity(draw_count);
let mut index_buffers = Vec::with_capacity(draw_count as usize);
for _ in 0..draw_count {
index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@ -177,12 +176,12 @@ impl RenderpassState {
}
random.shuffle(&mut index_buffers);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]);
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i => Float32x4]);
}
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for attributes in &vertex_buffer_attributes {
vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
array_stride: 16,
@ -263,7 +262,7 @@ impl RenderpassState {
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(texture_count as u32).unwrap()),
count: Some(NonZeroU32::new(texture_count).unwrap()),
}],
});
@ -343,9 +342,9 @@ impl RenderpassState {
fn run_subpass(
&self,
pass_number: usize,
total_passes: usize,
draw_count: usize,
pass_number: u32,
total_passes: u32,
draw_count: u32,
) -> wgpu::CommandBuffer {
profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));
@ -377,15 +376,16 @@ impl RenderpassState {
let end_idx = start_idx + draws_per_pass;
for draw_idx in start_idx..end_idx {
render_pass.set_pipeline(&self.pipeline);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx as usize], &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(
i as u32,
self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..),
i,
self.vertex_buffers[(draw_idx * VERTEX_BUFFERS_PER_DRAW + i) as usize]
.slice(..),
);
}
render_pass.set_index_buffer(
self.index_buffers[draw_idx].slice(..),
self.index_buffers[draw_idx as usize].slice(..),
wgpu::IndexFormat::Uint32,
);
render_pass.draw_indexed(0..3, 0, 0..1);
@ -396,7 +396,7 @@ impl RenderpassState {
encoder.finish()
}
fn run_bindless_pass(&self, draw_count: usize) -> wgpu::CommandBuffer {
fn run_bindless_pass(&self, draw_count: u32) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Renderpass");
let mut encoder = self
@ -424,12 +424,12 @@ impl RenderpassState {
render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
render_pass.set_bind_group(0, Some(self.bindless_bind_group.as_ref().unwrap()), &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..));
render_pass.set_vertex_buffer(i, self.vertex_buffers[0].slice(..));
}
render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);
for draw_idx in 0..draw_count {
render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1);
render_pass.draw_indexed(0..3, 0, draw_idx..draw_idx + 1);
}
drop(render_pass);
@ -438,178 +438,103 @@ impl RenderpassState {
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(RenderpassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = RenderpassState::new(&ctx);
let draw_count = draw_count();
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}
let draw_count = draw_count(&ctx);
let mut results = Vec::new();
// Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &rpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({rpasses} passes)"),
format!("Submit ({rpasses} passes)"),
];
for time_submit in [false, true] {
for &rpasses in thread_count_list() {
let draws_per_pass = draw_count / rpasses;
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses as usize);
let encoding_start = Instant::now();
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}
let encoding_duration = encoding_start.elapsed();
let label = if time_submit {
"Submit Time"
} else {
"Renderpass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
group.bench_function(
format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
|b| {
LazyLock::force(&state);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs(1);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let mut start = Instant::now();
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}
if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}
state.device_state.queue.submit(buffers);
if time_submit {
duration += start.elapsed();
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
vec![encoding_duration, submit_duration]
}));
}
group.finish();
// Test 10k draw calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];
for &threads in thread_count_list() {
let draws_per_pass = draw_count / threads;
group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();
duration += start.elapsed();
state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
vec![encoding_duration, submit_duration]
}));
}
group.finish();
// Test 10k draw calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Bindless");
group.throughput(Throughput::Elements(draw_count as _));
// Test 10k draw calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];
group.bench_function(format!("{draw_count} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(draw_count);
let encoding_duration = encoding_start.elapsed();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();
// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
let mut duration = Duration::ZERO;
vec![encoding_duration, submit_duration]
}));
}
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffer = state.run_bindless_pass(draw_count);
duration += start.elapsed();
state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
group.finish();
ctx.bench_function(
&format!(
"Renderpass: Empty Submit with {} Resources",
texture_count + vertex_buffer_count
),
|b| {
LazyLock::force(&state);
b.iter(|| state.device_state.queue.submit([]));
},
);
}
criterion_group! {
name = renderpass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,82 +1,62 @@
use std::time::{Duration, Instant};
use std::time::Instant;
use criterion::{criterion_group, Criterion, Throughput};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(DeviceState::new);
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let state = DeviceState::new();
const RESOURCES_TO_CREATE: usize = 8;
let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));
for &threads in thread_count_list() {
let mut results = Vec::new();
for &threads in thread_count_list(&ctx) {
let resources_per_thread = RESOURCES_TO_CREATE / threads;
group.bench_function(
format!("{threads} threads x {resources_per_thread} resource"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
// We can't create too many resources at once, so we do it 8 resources at a time.
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
results.push(iter(
&ctx,
&format!("{threads} threads"),
"buffers",
RESOURCES_TO_CREATE as u32,
|| {
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|_| {
(0..resources_per_thread)
.map(|_| {
(0..resources_per_thread)
.map(|_| {
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>()
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>();
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
let duration = start.elapsed();
duration += start.elapsed();
drop(buffers);
drop(buffers);
state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
duration
},
);
));
}
group.finish();
}
criterion_group! {
name = resource_creation;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,5 +1,5 @@
use criterion::*;
use std::{fs, process::Command};
use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};
const DIR_IN: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../naga/tests/in");
@ -141,28 +141,32 @@ fn get_wgsl_inputs() -> Inputs {
Inputs { inner: inputs }
}
fn frontends(c: &mut Criterion) {
let mut group = c.benchmark_group("front");
pub fn frontends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs_wgsl = get_wgsl_inputs();
group.throughput(Throughput::Bytes(inputs_wgsl.bytes()));
group.bench_function("shader: naga module bincode decode", |b| {
inputs_wgsl.parse();
inputs_wgsl.parse();
inputs_wgsl.load_utf8();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();
b.iter(move || {
results.push(iter_auto(
&ctx,
"bincode decode",
"bytes",
inputs_wgsl.bytes() as u32,
move || {
for input in inputs_bin.iter() {
bincode::serde::decode_from_slice::<naga::Module, _>(
input,
@ -170,20 +174,23 @@ fn frontends(c: &mut Criterion) {
)
.unwrap();
}
});
});
},
));
group.bench_function("shader: wgsl-in", |b| {
inputs_wgsl.load_utf8();
let mut frontend = naga::front::wgsl::Frontend::new();
let mut frontend = naga::front::wgsl::Frontend::new();
b.iter(|| {
results.push(iter_auto(
&ctx,
"wgsl",
"bytes",
inputs_wgsl.bytes() as u32,
|| {
for input in &inputs_wgsl.inner {
frontend.set_options((&input.options.wgsl_in).into());
frontend.parse(input.string.as_ref().unwrap()).unwrap();
}
});
});
},
));
let inputs_spirv = Inputs::from_dir("spv", "spvasm");
assert!(!inputs_spirv.is_empty());
@ -220,13 +227,16 @@ fn frontends(c: &mut Criterion) {
assembled_spirv.push(bytemuck::pod_collect_to_vec(&output.stdout));
}
let total_bytes = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();
let total_bytes: u64 = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();
assert!(assembled_spirv.len() == inputs_spirv.inner.len() || assembled_spirv.is_empty());
group.throughput(Throughput::Bytes(total_bytes));
group.bench_function("shader: spv-in", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv parse",
"bytes",
total_bytes as u32,
|| {
for (i, input) in assembled_spirv.iter().enumerate() {
let params = &inputs_spirv.inner[i].options;
let SpirvInParameters {
@ -243,140 +253,152 @@ fn frontends(c: &mut Criterion) {
);
parser.parse().unwrap();
}
});
});
},
));
let mut inputs_vertex = Inputs::from_dir("glsl", "vert");
let mut inputs_fragment = Inputs::from_dir("glsl", "frag");
let mut inputs_compute = Inputs::from_dir("glsl", "comp");
assert!(!inputs_vertex.is_empty());
assert!(!inputs_fragment.is_empty());
// let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp");
group.throughput(Throughput::Bytes(
inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes()
));
group.bench_function("shader: glsl-in", |b| {
inputs_vertex.load();
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
// inputs_compute.load_utf8();
assert!(!inputs_compute.is_empty());
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex));
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_fragment));
// TODO: This one hangs for some reason
// b.iter(move || parse_glsl(naga::ShaderStage::Compute, &inputs_compute));
});
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
inputs_compute.load_utf8();
results.push(iter_auto(
&ctx,
"glsl parse",
"bytes",
(inputs_vertex.bytes() + inputs_fragment.bytes() + inputs_compute.bytes()) as u32,
|| {
parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex);
parse_glsl(naga::ShaderStage::Fragment, &inputs_fragment);
parse_glsl(naga::ShaderStage::Compute, &inputs_compute);
},
));
Ok(results)
}
fn validation(c: &mut Criterion) {
pub fn validation(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
let mut group = c.benchmark_group("validate");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: validation", |b| {
inputs.load();
inputs.load_utf8();
inputs.parse();
inputs.parse();
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
b.iter(|| {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
results.push(iter_auto(
&ctx,
"validation",
"bytes",
inputs.bytes() as u32,
|| {
for input in &inputs.inner {
validator.validate(input.module.as_ref().unwrap()).unwrap();
}
});
});
group.finish();
},
));
Ok(results)
}
fn compact(c: &mut Criterion) {
pub fn compact(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
use naga::compact::{compact, KeepUnused};
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
inputs.validate();
assert!(!inputs.is_empty());
let mut group = c.benchmark_group("compact");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: compact", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"compact",
"bytes",
inputs.bytes() as u32,
|| {
for input in &mut inputs.inner {
compact(input.module.as_mut().unwrap(), KeepUnused::No);
}
});
});
group.finish();
},
));
Ok(results)
}
fn backends(c: &mut Criterion) {
pub fn backends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
let mut group = c.benchmark_group("back");
// While normally this would be done inside the bench_function callback, we need to
// run this to properly know the size of the inputs, as any that fail validation
// will be removed.
inputs.validate();
assert!(!inputs.is_empty());
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: wgsl-out", |b| {
b.iter(|| {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let total_bytes = inputs.bytes() as u32;
results.push(iter_auto(&ctx, "wgsl", "bytes", total_bytes, || {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
);
string.clear();
}
}
}));
results.push(iter_auto(&ctx, "spv", "bytes", total_bytes, || {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
string.clear();
data.clear();
}
}
});
});
}
}));
group.bench_function("shader: spv-out", |b| {
b.iter(|| {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
data.clear();
}
}
}
});
});
group.bench_function("shader: spv-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
@ -397,51 +419,51 @@ fn backends(c: &mut Criterion) {
}
}
}
});
});
},
));
group.bench_function("shader: msl-out", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
results.push(iter_auto(&ctx, "msl", "bytes", total_bytes, || {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
});
});
}
}));
group.bench_function("shader: hlsl-out", |b| {
b.iter(|| {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
); // may fail on unimplemented things
string.clear();
}
results.push(iter_auto(&ctx, "hlsl", "bytes", total_bytes, || {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
);
string.clear();
}
});
});
}
}));
group.bench_function("shader: glsl-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"glsl multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut string = String::new();
let options = naga::back::glsl::Options {
version: naga::back::glsl::Version::new_gles(320),
@ -462,7 +484,6 @@ fn backends(c: &mut Criterion) {
multiview: None,
};
// might be `Err` if missing features
if let Ok(mut writer) = naga::back::glsl::Writer::new(
&mut string,
module,
@ -471,14 +492,14 @@ fn backends(c: &mut Criterion) {
&pipeline_options,
naga::proc::BoundsCheckPolicies::default(),
) {
let _ = writer.write(); // might be `Err` if unsupported
let _ = writer.write();
}
string.clear();
}
}
});
});
}
},
));
criterion_group!(shader, frontends, validation, compact, backends);
Ok(results)
}

34
benches/src/context.rs Normal file

@ -0,0 +1,34 @@
use std::time::Duration;
#[derive(Clone, Copy)]
pub enum LoopControl {
Iterations(u32),
Time(Duration),
}
impl Default for LoopControl {
fn default() -> Self {
LoopControl::Time(Duration::from_secs(2))
}
}
impl LoopControl {
pub(crate) fn finished(&self, iterations: u32, elapsed: Duration) -> bool {
match self {
LoopControl::Iterations(target) => iterations >= *target,
LoopControl::Time(target) => elapsed >= *target,
}
}
}
pub struct BenchmarkContext {
pub(crate) override_iters: Option<LoopControl>,
pub default_iterations: LoopControl,
pub(crate) is_test: bool,
}
impl BenchmarkContext {
pub fn is_test(&self) -> bool {
self.is_test
}
}
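// A small illustrative check of the two stopping conditions. This test module
// is a sketch added for exposition only and is not part of this commit.
#[cfg(test)]
mod loop_control_sketch {
    use super::*;
    use std::time::Duration;

    #[test]
    fn stops_on_iterations_or_time() {
        // Iteration-based control ignores elapsed time entirely.
        let by_count = LoopControl::Iterations(3);
        assert!(!by_count.finished(2, Duration::from_secs(100)));
        assert!(by_count.finished(3, Duration::ZERO));

        // Time-based control ignores the iteration count.
        let by_time = LoopControl::Time(Duration::from_secs(2));
        assert!(!by_time.finished(1_000, Duration::from_secs(1)));
        assert!(by_time.finished(0, Duration::from_secs(2)));
    }
}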

27
benches/src/file.rs Normal file

@ -0,0 +1,27 @@
use anyhow::Context as _;
use crate::BenchmarkFile;
const FILE_PREFIX: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../target/bench/");
pub const PREVIOUS: &str = "previous";
pub(crate) fn get_comparison_file(baseline: Option<&str>) -> Option<BenchmarkFile> {
let file_name = baseline.unwrap_or(PREVIOUS);
let path = format!("{FILE_PREFIX}{file_name}.json");
let file = std::fs::read_to_string(path).ok()?;
let benchmark_file: BenchmarkFile = serde_json::from_str(&file).ok()?;
Some(benchmark_file)
}
pub(crate) fn write_results_file(
file_name: &str,
output_file: &BenchmarkFile,
) -> anyhow::Result<()> {
let path = format!("{FILE_PREFIX}{file_name}.json");
let json = serde_json::to_string_pretty(output_file)?;
std::fs::create_dir_all(FILE_PREFIX)
.with_context(|| format!("Trying to create directory {FILE_PREFIX}"))?;
std::fs::write(&path, json).with_context(|| format!("Trying to write file {path}"))?;
Ok(())
}
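// Illustrative path resolution (assuming this crate lives under `benches/` in
// the repository): FILE_PREFIX then resolves to `<repo>/target/bench/`, so every
// run rewrites `<repo>/target/bench/previous.json`, and `--save-baseline foo`
// additionally writes `<repo>/target/bench/foo.json`.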

97
benches/src/iter.rs Normal file

@ -0,0 +1,97 @@
use std::time::Duration;
use crate::{BenchmarkContext, LoopControl, SubBenchResult};
pub fn iter(
ctx: &BenchmarkContext,
name: &str,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut() -> Duration,
) -> SubBenchResult {
profiling::scope!("iter", name);
let mut iterations = 0_u32;
let mut duration = Duration::ZERO;
let control = if let Some(override_control) = ctx.override_iters {
override_control
} else {
ctx.default_iterations
};
while !control.finished(iterations, duration) {
duration += f();
iterations += 1;
}
SubBenchResult {
name: name.to_string(),
avg_duration_per_iteration: duration / iterations,
iterations,
throughput_unit: throughput_unit.to_string(),
throughput_count_per_iteration,
}
}
pub fn iter_auto(
ctx: &BenchmarkContext,
name: &str,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut(),
) -> SubBenchResult {
iter(
ctx,
name,
throughput_unit,
throughput_count_per_iteration,
|| {
let start = std::time::Instant::now();
f();
start.elapsed()
},
)
}
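// A hedged usage sketch: `iter` suits cases where per-iteration setup must be
// excluded from the measured time, while `iter_auto` simply times the whole
// closure. `make_input`, `parse`, and `input_len` below are hypothetical names
// used only for illustration.
//
// let result = iter(&ctx, "parse", "bytes", input_len, || {
//     let input = make_input();              // untimed setup
//     let start = std::time::Instant::now();
//     parse(&input);                         // timed work
//     start.elapsed()
// });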
pub fn iter_many(
ctx: &BenchmarkContext,
names: Vec<String>,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut() -> Vec<Duration>,
) -> Vec<SubBenchResult> {
profiling::scope!("iter", &*names[0]);
let mut iterations = 0_u32;
let mut durations = vec![Duration::ZERO; names.len()];
let control = if let Some(override_control) = ctx.override_iters {
override_control
} else {
LoopControl::Time(Duration::from_secs(1))
};
// We use the first duration to determine whether to stop. This means the other sub-benchmarks
// could have run for longer or shorter than intended, but that's acceptable.
while !control.finished(iterations, *durations.first().unwrap_or(&Duration::ZERO)) {
let iteration_durations = f();
assert_eq!(iteration_durations.len(), names.len());
for (i, dur) in iteration_durations.into_iter().enumerate() {
durations[i] += dur;
}
iterations += 1;
}
durations
.into_iter()
.enumerate()
.map(|(i, d)| SubBenchResult {
name: names[i].to_string(),
avg_duration_per_iteration: d / iterations,
iterations,
throughput_unit: throughput_unit.to_string(),
throughput_count_per_iteration,
})
.collect()
}

261
benches/src/lib.rs Normal file

@ -0,0 +1,261 @@
#![cfg(not(target_arch = "wasm32"))]
#![expect(clippy::disallowed_types)] // We're outside of the main wgpu codebase
//! Benchmarking framework for `wgpu`.
//!
//! This crate is a basic framework for benchmarking. Its design is guided
//! by a few goals:
//!
//! - Enumerating tests should be extremely cheap. `criterion` needs
//! to run all of your benchmark functions to enumerate them during
//! testing. This requires your code to contort itself to avoid doing
//! any work until you enter a benchmark callback. This framework
//!   avoids that by having an explicit list of benchmark functions.
//! - It must be compatible with `cargo-nextest` and have a compatible
//! "test" mode that runs each benchmark exactly once.
//! - It should have intuitive test grouping, allowing for quick
//!   execution of a reasonable baseline set of benchmarks
//! during development, while still allowing for a more exhaustive
//! benchmark suite to be run if desired.
//!
//! By default each benchmark runs for 2 seconds, but this can be overridden
//! by individual benchmarks.
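//!
//! A minimal usage sketch (the benchmark body, the `example` names, and the
//! `benches` crate name are illustrative assumptions, not part of this commit):
//!
//! ```ignore
//! use benches::{iter_auto, Benchmark, BenchmarkContext, SubBenchResult};
//!
//! fn example_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
//!     Ok(vec![iter_auto(&ctx, "noop", "elements", 1, || {
//!         std::hint::black_box(2 + 2);
//!     })])
//! }
//!
//! fn main() {
//!     benches::main(vec![Benchmark {
//!         name: "example",
//!         func: example_bench,
//!     }]);
//! }
//! ```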
use std::{collections::HashMap, io::IsTerminal, time::Duration};
use anyhow::Result;
use pico_args::Arguments;
use serde::{Deserialize, Serialize};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
mod context;
mod file;
mod iter;
mod print;
pub use context::*;
pub use iter::*;
use crate::file::PREVIOUS;
#[derive(Serialize, Deserialize, Default)]
pub struct BenchmarkFile {
pub results: HashMap<String, Vec<SubBenchResult>>,
}
impl BenchmarkFile {
pub fn get_result(
&self,
benchmark_name: &str,
sub_benchmark_name: &str,
) -> Option<&SubBenchResult> {
self.results
.get(benchmark_name)?
.iter()
.find(|r| r.name == sub_benchmark_name)
}
}
#[derive(Serialize, Deserialize)]
pub struct SubBenchResult {
/// Name of the subbenchmark.
pub name: String,
/// Average duration per iteration of the subbenchmark.
pub avg_duration_per_iteration: Duration,
/// Total number of iterations executed.
pub iterations: u32,
/// Throughput unit description. e.g., "bytes", "elements", etc.
pub throughput_unit: String,
/// Number of throughput units processed per iteration.
pub throughput_count_per_iteration: u32,
}
impl SubBenchResult {
pub fn throughput_per_second(&self) -> f64 {
let secs_f64 = self.avg_duration_per_iteration.as_secs_f64();
if secs_f64 == 0.0 {
return 0.0;
}
self.throughput_count_per_iteration as f64 / secs_f64
}
}
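// Worked example: 4_096 bytes processed per iteration at an average of 2 ms per
// iteration yields 4_096 / 0.002 = 2_048_000 bytes/s.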
pub struct Benchmark {
pub name: &'static str,
pub func: fn(BenchmarkContext) -> Result<Vec<SubBenchResult>>,
}
const HELP: &str = "\
Usage: wgpu-benchmark [OPTIONS] [BENCHMARK_NAME]
Modes:
--bench Run in benchmark mode, comparing against previous results.
--list List available benchmarks.
<no flag> Run in test mode, executing each benchmark exactly once.
Test Matching:
--exact When specifying BENCHMARK_NAME, only run exact matches.
BENCHMARK_NAME Only run benchmarks whose names contain this substring.
Comparison:
-b, --baseline NAME Specify a baseline file for comparison.
-s, --save-baseline NAME Save the results as a baseline file.
Timings:
--iters N Override number of iterations per benchmark.
--time SECONDS Override time per benchmark in seconds.
Other:
--color Set colored output (always,always-ansi,auto,never).
--format terse Specify --list output format (only 'terse' is supported).
--no-capture (Ignored)
";
pub fn main(benchmarks: Vec<Benchmark>) {
let mut args = Arguments::from_env();
let help = args.contains(["-h", "--help"]);
if help {
println!("{HELP}");
return;
}
let mut color: ColorChoice = args
.opt_value_from_str("--color")
.unwrap_or(None)
.unwrap_or(ColorChoice::Auto);
if color == ColorChoice::Auto && !std::io::stdin().is_terminal() {
color = ColorChoice::Never;
}
let exact = args.contains("--exact");
// We don't actually need this flag, but cargo-nextest passes it in
// test mode, so we need to accept it.
let _no_capture = args.contains("--no-capture");
#[expect(clippy::manual_map)] // So much clearer this way
let mut override_iterations = if let Some(iters) = args.opt_value_from_str("--iters").unwrap() {
Some(LoopControl::Iterations(iters))
} else if let Some(seconds) = args.opt_value_from_str("--time").unwrap() {
Some(LoopControl::Time(Duration::from_secs_f64(seconds)))
} else {
None
};
let baseline_name: Option<String> = args.opt_value_from_str(["-b", "--baseline"]).unwrap();
let write_baseline: Option<String> =
args.opt_value_from_str(["-s", "--save-baseline"]).unwrap();
let is_bench = args.contains("--bench");
let is_list = args.contains("--list");
let is_test = !is_bench && !is_list;
let format: Option<String> = args.opt_value_from_str("--format").unwrap();
if let Some(fmt) = format {
assert_eq!(fmt, "terse", "Only 'terse' format is supported.");
}
if let Some(ref baseline) = baseline_name {
if baseline == PREVIOUS {
eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
return;
}
}
if let Some(ref write_baseline) = write_baseline {
if write_baseline == PREVIOUS {
eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
return;
}
}
if override_iterations.is_none() && is_test {
override_iterations = Some(LoopControl::Iterations(1));
}
let name = args.free_from_str::<String>().ok();
let baseline = if is_bench {
let res = file::get_comparison_file(baseline_name.as_deref());
match (&res, baseline_name.as_deref()) {
(Some(_), Some(baseline)) => {
println!("Using baseline \"{baseline}\" for comparison.\n")
}
(None, Some(baseline)) => {
eprintln!("Could not find baseline named {baseline:?}.\n");
return;
}
(Some(_), None) => {
println!("Using previous benchmark results for comparison.\n");
}
(None, None) => {
println!("No previous benchmark results found for comparison.\n");
}
}
res
} else {
None
};
let mut output_file = BenchmarkFile::default();
let mut stdout = StandardStream::stdout(color);
for bench in benchmarks {
if let Some(ref bench_name) = name {
if exact {
if bench.name != bench_name {
continue;
}
} else if !bench.name.contains(bench_name) {
continue;
}
}
if is_list {
println!("{}: benchmark", bench.name);
continue;
}
let ctx = BenchmarkContext {
override_iters: override_iterations,
default_iterations: LoopControl::default(),
is_test,
};
stdout
.set_color(ColorSpec::new().set_fg(Some(Color::Blue)))
.unwrap();
println!("Running benchmark: {}", bench.name);
stdout.reset().unwrap();
let results = {
profiling::scope!("bench", bench.name);
let r = (bench.func)(ctx);
match r {
Ok(r) => r,
Err(e) => {
eprintln!(" Error running benchmark '{}': {:?}", bench.name, e);
continue;
}
}
};
let previous_results = if let Some(ref baseline) = baseline {
baseline.results.get(bench.name).map(|r| r.as_slice())
} else {
None
};
print::print_results(&mut stdout, &results, previous_results);
output_file.results.insert(bench.name.to_string(), results);
}
file::write_results_file(PREVIOUS, &output_file).unwrap();
if let Some(output_baseline) = write_baseline {
file::write_results_file(&output_baseline, &output_file).unwrap();
}
}

206
benches/src/print.rs Normal file

@ -0,0 +1,206 @@
use std::collections::HashMap;
use std::io::Write;
use termcolor::{Color, ColorSpec, StandardStream, WriteColor};
use crate::SubBenchResult;
#[derive(Default, Clone)]
struct Delta {
throughput_change_str: String,
throughput_change: f64,
time_change_str: String,
time_change: f64,
}
impl Delta {
fn new(previous: &SubBenchResult, current: &SubBenchResult) -> Self {
let prev_throughput = previous.throughput_per_second();
let curr_throughput = current.throughput_per_second();
let delta_throughput = if prev_throughput != 0.0 {
(curr_throughput - prev_throughput) / prev_throughput * 100.0
} else {
0.0
};
let throughput_change = format!(" ({delta_throughput:+.2}%)");
let prev_time = previous.avg_duration_per_iteration;
let curr_time = current.avg_duration_per_iteration;
let delta_time = if prev_time.as_nanos() != 0 {
(curr_time.as_secs_f64() - prev_time.as_secs_f64()) / prev_time.as_secs_f64() * 100.0
} else {
0.0
};
let time_change = format!("{delta_time:+.2}%; ");
Delta {
throughput_change_str: throughput_change,
throughput_change: delta_throughput,
time_change_str: time_change,
time_change: delta_time,
}
}
}
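// Worked example: a sub-benchmark that previously averaged 10 ms/iter and now
// averages 12 ms/iter gets a time change of +20.00%, while its throughput falls
// from N/0.010 to N/0.012 per second, a change of about -16.67%.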
/// Get a color spec for the given change percentage.
///
/// Positive changes are red (regression), negative changes are green (improvement).
/// This represents changes for time durations. For throughput changes, the sign should be inverted
/// before being passed to this function.
fn get_change_color(percent_change: f64) -> ColorSpec {
let mut color_spec = ColorSpec::new();
if percent_change > 3.0 {
color_spec.set_fg(Some(Color::Red));
} else if percent_change < -3.0 {
color_spec.set_fg(Some(Color::Green));
} else {
color_spec.set_fg(Some(Color::Yellow));
}
if percent_change.abs() > 15.0 {
color_spec.set_intense(true);
}
color_spec
}
pub fn print_results(
stdout: &mut StandardStream,
results: &[SubBenchResult],
previous_results: Option<&[SubBenchResult]>,
) {
let mut deltas = HashMap::new();
if let Some(previous_results) = previous_results {
for result in results {
if let Some(previous_result) = previous_results.iter().find(|r| r.name == result.name) {
deltas.insert(result.name.clone(), Delta::new(previous_result, result));
}
}
}
let longest_throughput_change_len = deltas
.values()
.map(|d| d.throughput_change_str.len())
.max()
.unwrap_or(0);
let longest_time_change_len = deltas
.values()
.map(|d| d.time_change_str.len())
.max()
.unwrap_or(0);
let longest_name_len = results.iter().map(|r| r.name.len()).max().unwrap_or(0);
let duration_strings: Vec<String> = results
.iter()
.map(|r| format!("{:.3?}", r.avg_duration_per_iteration))
.collect();
let longest_duration_len = duration_strings.iter().map(|s| s.len()).max().unwrap_or(0);
let iterations_strings: Vec<String> = results
.iter()
.map(|r| format!("{}", r.iterations))
.collect();
let longest_iterations_len = iterations_strings
.iter()
.map(|s| s.len())
.max()
.unwrap_or(0);
let throughput_strings: Vec<String> = results
.iter()
.map(|r| {
let throughput_per_second = r.throughput_count_per_iteration as f64
/ r.avg_duration_per_iteration.as_secs_f64();
human_scale(throughput_per_second)
})
.collect();
let longest_throughput_len = throughput_strings
.iter()
.map(|s| s.len())
.max()
.unwrap_or(0);
let longest_throughput_unit_len = results
.iter()
.map(|r| r.throughput_unit.len())
.max()
.unwrap_or(0);
for (i, result) in results.iter().enumerate() {
let delta = deltas.get(&result.name).cloned().unwrap_or_default();
let time_color = get_change_color(delta.time_change);
let throughput_color = get_change_color(-delta.throughput_change);
stdout
.set_color(ColorSpec::new().set_fg(Some(Color::Cyan)))
.unwrap();
write!(stdout, " {:>longest_name_len$}: ", result.name).unwrap();
stdout.set_color(&time_color).unwrap();
write!(stdout, "{:>longest_duration_len$} ", duration_strings[i],).unwrap();
stdout.reset().unwrap();
write!(stdout, "(").unwrap();
stdout.set_color(&time_color).unwrap();
write!(
stdout,
"{:>longest_time_change_len$}",
delta.time_change_str
)
.unwrap();
stdout.reset().unwrap();
write!(
stdout,
"over {:>longest_iterations_len$} iter) ",
result.iterations,
)
.unwrap();
stdout.set_color(&throughput_color).unwrap();
write!(stdout, "{:>longest_throughput_len$}", throughput_strings[i]).unwrap();
stdout.reset().unwrap();
write!(
stdout,
" {:>longest_throughput_unit_len$}/s",
result.throughput_unit,
)
.unwrap();
stdout.set_color(&throughput_color).unwrap();
writeln!(
stdout,
"{:>longest_throughput_change_len$}",
delta.throughput_change_str
)
.unwrap();
}
println!();
}
fn human_scale(value: f64) -> String {
const PREFIXES: &[&str] = &["", "K", "M", "G", "T", "P"];
if value == 0.0 {
return "0".to_string();
}
let abs_value = value.abs();
let exponent = (abs_value.log10() / 3.0).floor() as usize;
let prefix_index = exponent.min(PREFIXES.len() - 1);
let scaled = value / 10_f64.powi((prefix_index * 3) as i32);
// Determine decimal places for 3 significant figures
let decimal_places = if scaled.abs() >= 100.0 {
0
} else if scaled.abs() >= 10.0 {
1
} else {
2
};
format!(
"{:.prec$}{}",
scaled,
PREFIXES[prefix_index],
prec = decimal_places
)
}
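// Worked examples (computed from the logic above):
//   human_scale(1_234.0)     -> "1.23K"
//   human_scale(123_456.0)   -> "123K"
//   human_scale(2_500_000.0) -> "2.50M"
// Values below 1.0 keep the empty prefix, because the negative exponent
// saturates to 0 in the `as usize` cast: human_scale(0.5) -> "0.50".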