mirror of https://github.com/gfx-rs/wgpu.git
synced 2025-12-08 21:26:17 +00:00

Setup and use new benchmarking harness (#8511)

parent 6043b059c4
commit 853ad6c464
Cargo.lock (generated), 122 changes
@@ -118,12 +118,6 @@ dependencies = [
 "libc",
]

[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"

[[package]]
name = "anstream"
version = "0.6.21"
@@ -592,12 +586,6 @@ dependencies = [
 "thiserror 2.0.17",
]

[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

[[package]]
name = "cc"
version = "1.2.46"
@@ -652,33 +640,6 @@ dependencies = [
 "libc",
]

[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
 "ciborium-io",
 "ciborium-ll",
 "serde",
]

[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"

[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
 "ciborium-io",
 "half",
]

[[package]]
name = "clang-sys"
version = "1.8.1"
@@ -884,39 +845,6 @@ dependencies = [
 "cfg-if",
]

[[package]]
name = "criterion"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
dependencies = [
 "anes",
 "cast",
 "ciborium",
 "clap",
 "criterion-plot",
 "itertools 0.13.0",
 "num-traits",
 "oorandom",
 "plotters",
 "rayon",
 "regex",
 "serde",
 "serde_json",
 "tinytemplate",
 "walkdir",
]

[[package]]
name = "criterion-plot"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
dependencies = [
 "cast",
 "itertools 0.13.0",
]

[[package]]
name = "crossbeam-deque"
version = "0.8.6"
@@ -3022,12 +2950,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"

[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"

[[package]]
name = "orbclient"
version = "0.3.49"
@@ -3185,34 +3107,6 @@ dependencies = [
 "winit 0.29.15",
]

[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
 "num-traits",
 "plotters-backend",
 "plotters-svg",
 "wasm-bindgen",
 "web-sys",
]

[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"

[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
 "plotters-backend",
]

[[package]]
name = "png"
version = "0.18.0"
@@ -4112,16 +4006,6 @@ dependencies = [
 "zerovec",
]

[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
 "serde",
 "serde_json",
]

[[package]]
name = "tokio"
version = "1.48.0"
@@ -4800,15 +4684,19 @@ dependencies = [
name = "wgpu-benchmark"
version = "27.0.0"
dependencies = [
 "anyhow",
 "bincode 2.0.1",
 "bytemuck",
 "criterion",
 "naga",
 "naga-test",
 "nanorand 0.8.0",
 "pico-args",
 "pollster",
 "profiling",
 "rayon",
 "serde",
 "serde_json",
 "termcolor",
 "tracy-client",
 "wgpu",
]
@@ -250,7 +250,7 @@ deno_webgpu = { version = "0.181.0", path = "./deno_webgpu" }
deno_unsync = "0.4.4"
deno_error = "0.7.0"
tokio = "1.47"
termcolor = "1.1.3"
termcolor = "1.4.1"

# android dependencies
ndk-sys = "0.6"
@@ -16,19 +16,18 @@ name = "wgpu-benchmark"
harness = false

[features]
# Uncomment these features to enable tracy and superluminal profiling.
# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
# superluminal = ["profiling/profile-with-superluminal"]
tracy = ["dep:tracy-client"]

[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = [
    'cfg(feature, values("tracy"))',
] }

[dependencies]
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
anyhow.workspace = true
bincode = { workspace = true, features = ["serde"] }
bytemuck.workspace = true
criterion.workspace = true
# criterion.workspace = true
naga = { workspace = true, features = [
    "deserialize",
    "serialize",
@@ -43,8 +42,12 @@ naga = { workspace = true, features = [
] }
naga-test = { workspace = true, features = [] }
nanorand.workspace = true
pico-args.workspace = true
pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
termcolor.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true
@@ -1,9 +1,6 @@
Collection of CPU benchmarks for `wgpu`.

These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance for users.
They all do very little GPU work and are testing the CPU performance of the API.

Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.

## Usage

@@ -14,65 +11,30 @@ cargo bench -p wgpu-benchmark
cargo bench -p wgpu-benchmark -- "filter"
```

## Benchmarks

#### `Renderpass`

This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.

Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.

#### `Computepass`

This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.

Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every draw call.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.

#### `Resource Creation`

This benchmark measures the performance of creating large resources. By default it makes buffers that are 256MB. It tests this over a range of thread counts.

#### `Shader Compilation`

This benchmark measures the performance of naga parsing, validating, and generating shaders.
Use `WGPU_BACKEND` and `WGPU_ADAPTER_NAME` to adjust which device the benchmarks use. [More info on env vars](../README.md#environment-variables).
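For example, to force a particular backend and adapter (illustrative values; the variables themselves are documented at the link above):

```sh
# Run the benchmarks on the Vulkan backend, on the adapter whose name
# contains "RTX" (both values are examples, not defaults).
WGPU_BACKEND=vulkan WGPU_ADAPTER_NAME="RTX" cargo bench -p wgpu-benchmark
```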
## Comparing Against a Baseline

To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.

For example, to compare v0.20 against trunk, you could run the following:
For example, to compare v28 against trunk, you could run the following:

```sh
git checkout v0.20

git checkout v28
# Run the baseline benchmarks
cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
cargo bench -p wgpu-benchmark -- --save-baseline "v28"

git checkout trunk

# Run the current benchmarks
cargo bench -p wgpu-benchmark -- --baseline "v0.20"
cargo bench -p wgpu-benchmark -- --baseline "v28"
```

You can use this for any bits of code you want to compare.
The current benchmarking framework was added shortly before v28, so baseline comparisons only work for commits after it landed. For older commits the same commands still work, but the comparison is done using `criterion`.

## Integration with Profilers

The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
you need to uncomment the features in the `Cargo.toml` to allow features to be used.
Integrations are available for `tracy` and `superluminal`.

#### Tracy

@@ -80,7 +42,7 @@ Tracy is available prebuilt for Windows on [github](https://github.com/wolfpld/t

```sh
# Once this is running, you can connect to it with the Tracy Profiler
cargo bench -p wgpu-benchmark --features tracy
cargo bench -p wgpu-benchmark --features tracy,profiling/profile-with-tracy
```

#### Superluminal
@@ -89,10 +51,10 @@ Superluminal is a paid product for windows available [here](https://superluminal

```sh
# This command will build the benchmarks, and display the path to the executable
cargo bench -p wgpu-benchmark --features superluminal -- -h
cargo bench -p wgpu-benchmark --features profiling/profile-with-superluminal -- -h

# Have Superluminal run the following command (replacing with the path to the executable)
./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
<path_to_exe> --bench "filter"
```

#### `perf` and others
@@ -105,6 +67,42 @@ For example, the command line tool `perf` can be used to profile the benchmarks.
cargo bench -p wgpu-benchmark -- -h

# Run the benchmarks with perf
perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
perf record <path_to_exe> --bench "filter"
```

## Benchmarks

#### `Renderpass Encoding`

This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.

Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.

#### `Computepass Encoding`

This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.

Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.

#### `Device::create_buffer`

This benchmark measures the performance of creating large buffers.

#### `Device::create_bind_group`

This benchmark measures the performance of creating large bind groups of 5 to 50,000 resources.

#### `naga::back`, `naga::compact`, `naga::front`, and `naga::valid`

These benchmarks measure the performance of naga parsing, validating, and generating shaders.
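The `"filter"` argument shown under Usage matches against these benchmark names. For example, to run only the naga frontend benchmarks (an illustrative invocation, using the names registered in `main` further down in this diff):

```sh
cargo bench -p wgpu-benchmark -- "naga::front"
```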
@@ -1,173 +1,127 @@
use std::{
num::NonZeroU32,
time::{Duration, Instant},
};
use std::{num::NonZeroU32, time::Instant};

use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};

use crate::{is_test, DeviceState};
use crate::DeviceState;

struct Params {
max_texture_count: u32,
texture_counts: &'static [u32],
}

// Creating 50_000 textures takes a considerable amount of time with syncval enabled.
//
// We greatly reduce the number of textures for the test case to keep the runtime
// reasonable for testing.
const MAX_TEXTURE_COUNT_BENCHMARK: u32 = 50_000;
const TEXTURE_COUNTS_BENCHMARK: &[u32] = &[5, 50, 500, 5_000, 50_000];
const BENCHMARK_PARAMS: Params = Params {
max_texture_count: 50_000,
texture_counts: &[5, 50, 500, 5_000, 50_000],
};

const MAX_TEXTURE_COUNT_TEST: u32 = 5;
const TEXTURE_COUNTS_TEST: &[u32] = &[5];
const TEST_PARAMS: Params = Params {
max_texture_count: 5,
texture_counts: &[5],
};

struct BindGroupState {
device_state: DeviceState,
texture_views: Vec<wgpu::TextureView>,
}
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let device_state = DeviceState::new();

impl BindGroupState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();

let texture_count = if is_test() {
MAX_TEXTURE_COUNT_TEST
} else {
MAX_TEXTURE_COUNT_BENCHMARK
};

// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);

let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);

Self {
device_state,
texture_views,
}
if !device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
anyhow::bail!("Device does not support required feature TEXTURE_BINDING_ARRAY");
}
}

fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(BindGroupState::new);

let mut group = ctx.benchmark_group("Bind Group Creation");

let count_list = if is_test() {
TEXTURE_COUNTS_TEST
let params = if ctx.is_test() {
TEST_PARAMS
} else {
TEXTURE_COUNTS_BENCHMARK
BENCHMARK_PARAMS
};

for &count in count_list {
group.throughput(Throughput::Elements(count as u64));
group.bench_with_input(
format!("{count} Element Bind Group"),
&count,
|b, &count| {
b.iter_custom(|iters| {
if !state
.device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
return Duration::ZERO;
}
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);

if count
> state
.device_state
.device
.limits()
.max_sampled_textures_per_shader_stage
{
return Duration::ZERO;
}

let bind_group_layout = state.device_state.device.create_bind_group_layout(
&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float {
filterable: true,
},
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
},
);

let texture_view_refs: Vec<_> =
state.texture_views.iter().take(count as usize).collect();

let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");

let start = Instant::now();
let bind_group = state.device_state.device.create_bind_group(
&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(
&texture_view_refs,
),
}],
label: None,
},
);

duration += start.elapsed();

drop(bind_group);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
});
},
);
let mut texture_views = Vec::with_capacity(params.max_texture_count as usize);
for i in 0..params.max_texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
}
random.shuffle(&mut texture_views);

criterion_group! {
name = bind_groups;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
let mut results = Vec::new();

for &count in params.texture_counts {
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
});

let texture_view_refs: Vec<_> = texture_views.iter().take(count as usize).collect();

let name = format!("{count} Textures");

let res = iter(&ctx, &name, "bindings", count, || {
let start = Instant::now();
let bind_group = device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs),
}],
label: None,
});

let time = start.elapsed();

drop(bind_group);
device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

time
});

results.push(res);
}

Ok(results)
}
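For orientation, the harness entry points used above have a simple shape: each `run_bench` receives a `BenchmarkContext` and returns one `SubBenchResult` per sub-benchmark, and `iter` drives a closure that returns the measured `Duration` of one iteration. A minimal sketch, assuming the internal `wgpu-benchmark` signatures exactly as they appear in this diff:

```rust
use std::time::Instant;

use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};

// Hypothetical minimal benchmark written against the new harness.
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
    let mut results = Vec::new();
    // `iter` takes the context, a sub-benchmark name, a throughput unit
    // label, an element count, and a closure that times one iteration
    // of the measured work.
    results.push(iter(&ctx, "example", "items", 1, || {
        let start = Instant::now();
        // ... the work being measured ...
        start.elapsed()
    }));
    Ok(results)
}
```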
@@ -3,17 +3,16 @@ use std::{
time::{Duration, Instant},
};

use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_auto, iter_many, BenchmarkContext, LoopControl};

use crate::{is_test, DeviceState};
use crate::DeviceState;

fn dispatch_count() -> usize {
fn dispatch_count(ctx: &BenchmarkContext) -> usize {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
@@ -25,18 +24,18 @@ fn dispatch_count() -> usize {
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
fn dispatch_count_bindless() -> usize {
fn dispatch_count_bindless(ctx: &BenchmarkContext) -> usize {
// On CI we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
1_000
}
}

fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[2, 4, 8]
@@ -62,11 +61,11 @@ struct ComputepassState {

impl ComputepassState {
/// Create and prepare all the resources needed for the computepass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();

let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let dispatch_count = dispatch_count(ctx);
let dispatch_count_bindless = dispatch_count_bindless(ctx);
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
@@ -377,10 +376,15 @@ impl ComputepassState {
}
}

fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
fn run_subpass(
&self,
ctx: &BenchmarkContext,
pass_number: usize,
total_passes: usize,
) -> wgpu::CommandBuffer {
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));

let dispatch_count = dispatch_count();
let dispatch_count = dispatch_count(ctx);
let dispatch_per_pass = dispatch_count / total_passes;

let mut encoder = self
@@ -431,183 +435,140 @@ impl ComputepassState {
}
}

fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(ComputepassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = ComputepassState::new(&ctx);

let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}

let dispatch_count = dispatch_count(&ctx);
let dispatch_count_bindless = dispatch_count_bindless(&ctx);

let mut results = Vec::new();

// Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
let mut group = ctx.benchmark_group("Computepass: Single Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));
for &cpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({cpasses} passes)"),
format!("Submit ({cpasses} passes)"),
];

for time_submit in [false, true] {
for &cpasses in thread_count_list() {
let dispatch_per_pass = dispatch_count / cpasses;
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
let encoding_start = Instant::now();
for i in 0..cpasses {
buffers.push(state.run_subpass(&ctx, i, cpasses));
}
let encoding_duration = encoding_start.elapsed();

let label = if time_submit {
"Submit Time"
} else {
"Computepass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();

group.bench_function(
format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
|b| {
LazyLock::force(&state);

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let mut start = Instant::now();

let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
for i in 0..cpasses {
buffers.push(state.run_subpass(i, cpasses));
}

if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}

state.device_state.queue.submit(buffers);

if time_submit {
duration += start.elapsed();
}

state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
},
);
}
}
group.finish();

// Test 10k dispatch calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));

for &threads in thread_count_list() {
let dispatch_per_pass = dispatch_count / threads;
group.bench_function(
format!("{threads} threads x {dispatch_per_pass} dispatch"),
|b| {
LazyLock::force(&state);

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let start = Instant::now();

let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads))
.collect::<Vec<_>>();

duration += start.elapsed();

state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
},
);
}
group.finish();

// Test 10k dispatch calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Bindless");
group.throughput(Throughput::Elements(dispatch_count_bindless as _));

group.bench_function(format!("{dispatch_count_bindless} dispatch"), |b| {
LazyLock::force(&state);

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}

// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs(1);
}

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let start = Instant::now();

let buffer = state.run_bindless_pass(dispatch_count_bindless);

duration += start.elapsed();

state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
});
group.finish();
vec![encoding_duration, submit_duration]
},
));
}

ctx.bench_function(
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];

results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(&ctx, i, threads))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();

let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();

state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

vec![encoding_duration, submit_duration]
},
));
}

// Test 10k dispatch calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];

results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count_bindless as _,
|| {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(dispatch_count_bindless);
let encoding_duration = encoding_start.elapsed();

let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();

state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

vec![encoding_duration, submit_duration]
},
));
}

// Test empty submit overhead with all resources
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;

results.push(iter_auto(
&ctx,
&format!(
"Computepass: Empty Submit with {} Resources",
"Empty Submit with {} Resources",
texture_count + storage_texture_count + storage_buffer_count
),
|b| {
LazyLock::force(&state);

b.iter(|| state.device_state.queue.submit([]));
"submits",
1,
|| {
state.device_state.queue.submit([]);
},
);
}
));

criterion_group! {
name = computepass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}
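The restructuring above replaces the old `time_submit` flag, which ran the whole workload twice (once timing encoding, once timing submission), with a single run that takes both timings and hands them to `iter_many`, one `Duration` per label. A self-contained sketch of that two-phase timing pattern:

```rust
use std::time::{Duration, Instant};

// Time two consecutive phases of one unit of work in a single run,
// instead of re-running the whole workload once per phase.
fn measure_phases(mut encode: impl FnMut(), mut submit: impl FnMut()) -> Vec<Duration> {
    let encoding_start = Instant::now();
    encode();
    let encoding_duration = encoding_start.elapsed();

    let submit_start = Instant::now();
    submit();
    let submit_duration = submit_start.elapsed();

    // One entry per label, mirroring the `vec![encoding_duration,
    // submit_duration]` returned to `iter_many` above.
    vec![encoding_duration, submit_duration]
}

fn main() {
    let timings = measure_phases(
        || { /* record command buffers */ },
        || { /* queue.submit(...) */ },
    );
    println!("encode: {:?}, submit: {:?}", timings[0], timings[1]);
}
```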
@@ -1,5 +1,7 @@
use criterion::criterion_main;
#[cfg_attr(target_arch = "wasm32", no_main)]
#[cfg(not(target_arch = "wasm32"))]
use pollster::block_on;
use wgpu_benchmark::Benchmark;

mod bind_groups;
mod computepass;
@@ -7,10 +9,6 @@ mod renderpass;
mod resource_creation;
mod shader;

fn is_test() -> bool {
std::env::var("NEXTEST").is_ok()
}

struct DeviceState {
adapter_info: wgpu::AdapterInfo,
device: wgpu::Device,
@@ -41,14 +39,17 @@ impl DeviceState {

let adapter_info = adapter.get_info();

eprintln!("{adapter_info:?}");
println!(
"  Using adapter: {} ({:?})",
adapter_info.name, adapter_info.backend
);

let (device, queue) = block_on(adapter.request_device(&wgpu::DeviceDescriptor {
required_features: adapter.features(),
required_limits: adapter.limits(),
memory_hints: wgpu::MemoryHints::Performance,
experimental_features: unsafe { wgpu::ExperimentalFeatures::enabled() },
label: Some("Compute/RenderPass Device"),
label: None,
trace: wgpu::Trace::Off,
}))
.unwrap();
@@ -61,10 +62,41 @@ impl DeviceState {
}
}

criterion_main!(
bind_groups::bind_groups,
renderpass::renderpass,
computepass::computepass,
resource_creation::resource_creation,
shader::shader
);
fn main() {
let benchmarks = vec![
Benchmark {
name: "Device::create_bind_group",
func: bind_groups::run_bench,
},
Benchmark {
name: "Device::create_buffer",
func: resource_creation::run_bench,
},
Benchmark {
name: "naga::front",
func: shader::frontends,
},
Benchmark {
name: "naga::valid",
func: shader::validation,
},
Benchmark {
name: "naga::compact",
func: shader::compact,
},
Benchmark {
name: "naga::back",
func: shader::backends,
},
Benchmark {
name: "Renderpass Encoding",
func: renderpass::run_bench,
},
Benchmark {
name: "Computepass Encoding",
func: computepass::run_bench,
},
];

wgpu_benchmark::main(benchmarks);
}
@@ -3,34 +3,33 @@ use std::{
time::{Duration, Instant},
};

use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_many, BenchmarkContext, LoopControl};

use crate::{is_test, DeviceState};
use crate::DeviceState;

fn draw_count() -> usize {
fn draw_count(ctx: &BenchmarkContext) -> u32 {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
}
}

fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [u32] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
&[1, 2, 4]
}
}

// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
const TEXTURES_PER_DRAW: u32 = 7;
const VERTEX_BUFFERS_PER_DRAW: u32 = 2;

struct RenderpassState {
device_state: DeviceState,
@@ -47,10 +46,10 @@ struct RenderpassState {

impl RenderpassState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();

let draw_count = draw_count();
let draw_count = draw_count(ctx);
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;

@@ -69,10 +68,10 @@ impl RenderpassState {
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);

let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for i in 0..TEXTURES_PER_DRAW {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
binding: i,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
@@ -91,7 +90,7 @@ impl RenderpassState {
entries: &bind_group_layout_entries,
});

let mut texture_views = Vec::with_capacity(texture_count);
let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
@@ -118,14 +117,14 @@ impl RenderpassState {

let texture_view_refs: Vec<_> = texture_views.iter().collect();

let mut bind_groups = Vec::with_capacity(draw_count);
let mut bind_groups = Vec::with_capacity(draw_count as usize);
for draw_idx in 0..draw_count {
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for tex_idx in 0..TEXTURES_PER_DRAW {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
binding: tex_idx,
resource: wgpu::BindingResource::TextureView(
&texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx],
&texture_views[(draw_idx * TEXTURES_PER_DRAW + tex_idx) as usize],
),
});
}
@@ -155,7 +154,7 @@ impl RenderpassState {
push_constant_ranges: &[],
});

let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count);
let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count as usize);
for _ in 0..vertex_buffer_count {
vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@@ -166,7 +165,7 @@ impl RenderpassState {
}
random.shuffle(&mut vertex_buffers);

let mut index_buffers = Vec::with_capacity(draw_count);
let mut index_buffers = Vec::with_capacity(draw_count as usize);
for _ in 0..draw_count {
index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@@ -177,12 +176,12 @@ impl RenderpassState {
}
random.shuffle(&mut index_buffers);

let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]);
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i => Float32x4]);
}

let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for attributes in &vertex_buffer_attributes {
vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
array_stride: 16,
@@ -263,7 +262,7 @@ impl RenderpassState {
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(texture_count as u32).unwrap()),
count: Some(NonZeroU32::new(texture_count).unwrap()),
}],
});

@@ -343,9 +342,9 @@ impl RenderpassState {

fn run_subpass(
&self,
pass_number: usize,
total_passes: usize,
draw_count: usize,
pass_number: u32,
total_passes: u32,
draw_count: u32,
) -> wgpu::CommandBuffer {
profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));

@@ -377,15 +376,16 @@ impl RenderpassState {
let end_idx = start_idx + draws_per_pass;
for draw_idx in start_idx..end_idx {
render_pass.set_pipeline(&self.pipeline);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx as usize], &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(
i as u32,
self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..),
i,
self.vertex_buffers[(draw_idx * VERTEX_BUFFERS_PER_DRAW + i) as usize]
.slice(..),
);
}
render_pass.set_index_buffer(
self.index_buffers[draw_idx].slice(..),
self.index_buffers[draw_idx as usize].slice(..),
wgpu::IndexFormat::Uint32,
);
render_pass.draw_indexed(0..3, 0, 0..1);
@@ -396,7 +396,7 @@ impl RenderpassState {
encoder.finish()
}

fn run_bindless_pass(&self, draw_count: usize) -> wgpu::CommandBuffer {
fn run_bindless_pass(&self, draw_count: u32) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Renderpass");

let mut encoder = self
@@ -424,12 +424,12 @@ impl RenderpassState {
render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
render_pass.set_bind_group(0, Some(self.bindless_bind_group.as_ref().unwrap()), &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..));
render_pass.set_vertex_buffer(i, self.vertex_buffers[0].slice(..));
}
render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);

for draw_idx in 0..draw_count {
render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1);
render_pass.draw_indexed(0..3, 0, draw_idx..draw_idx + 1);
}

drop(render_pass);
@@ -438,178 +438,103 @@ impl RenderpassState {
}
}

fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(RenderpassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = RenderpassState::new(&ctx);

let draw_count = draw_count();
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}

let draw_count = draw_count(&ctx);

let mut results = Vec::new();

// Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &rpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({rpasses} passes)"),
format!("Submit ({rpasses} passes)"),
];

for time_submit in [false, true] {
for &rpasses in thread_count_list() {
let draws_per_pass = draw_count / rpasses;
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses as usize);
let encoding_start = Instant::now();
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}
let encoding_duration = encoding_start.elapsed();

let label = if time_submit {
"Submit Time"
} else {
"Renderpass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();

group.bench_function(
format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
|b| {
LazyLock::force(&state);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs(1);
}

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let mut start = Instant::now();

let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}

if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}

state.device_state.queue.submit(buffers);

if time_submit {
duration += start.elapsed();
}

state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
},
);
}
vec![encoding_duration, submit_duration]
}));
}
group.finish();

// Test 10k draw calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];

for &threads in thread_count_list() {
let draws_per_pass = draw_count / threads;
group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();

// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let start = Instant::now();

let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();

duration += start.elapsed();

state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
});
vec![encoding_duration, submit_duration]
}));
}
group.finish();

// Test 10k draw calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Bindless");
group.throughput(Throughput::Elements(draw_count as _));
// Test 10k draw calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];

group.bench_function(format!("{draw_count} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(draw_count);
let encoding_duration = encoding_start.elapsed();

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();

// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

let mut duration = Duration::ZERO;
vec![encoding_duration, submit_duration]
}));
}

for _ in 0..iters {
profiling::scope!("benchmark iteration");

let start = Instant::now();

let buffer = state.run_bindless_pass(draw_count);

duration += start.elapsed();

state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
});
group.finish();

ctx.bench_function(
&format!(
"Renderpass: Empty Submit with {} Resources",
texture_count + vertex_buffer_count
),
|b| {
LazyLock::force(&state);

b.iter(|| state.device_state.queue.submit([]));
},
);
}

criterion_group! {
name = renderpass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}
@ -1,82 +1,62 @@
use std::time::{Duration, Instant};
use std::time::Instant;

use criterion::{criterion_group, Criterion, Throughput};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};

use crate::{is_test, DeviceState};
use crate::DeviceState;

fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
}
}

fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(DeviceState::new);
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let state = DeviceState::new();

const RESOURCES_TO_CREATE: usize = 8;

let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));

for &threads in thread_count_list() {
let mut results = Vec::new();
for &threads in thread_count_list(&ctx) {
let resources_per_thread = RESOURCES_TO_CREATE / threads;
group.bench_function(
format!("{threads} threads x {resources_per_thread} resource"),
|b| {
LazyLock::force(&state);

b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");

let mut duration = Duration::ZERO;

for _ in 0..iters {
profiling::scope!("benchmark iteration");

// We can't create too many resources at once, so we do it 8 resources at a time.
let start = Instant::now();

let buffers = (0..threads)
.into_par_iter()
results.push(iter(
&ctx,
&format!("{threads} threads"),
"buffers",
RESOURCES_TO_CREATE as u32,
|| {
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|_| {
(0..resources_per_thread)
.map(|_| {
(0..resources_per_thread)
.map(|_| {
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>()
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>();
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
let duration = start.elapsed();

duration += start.elapsed();
drop(buffers);

drop(buffers);
state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();

state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}

duration
})
duration
},
);
));
}
group.finish();
}

criterion_group! {
name = resource_creation;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}

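The conversion above is the template for every harness port in this commit: criterion's `b.iter_custom(|iters| ...)` accumulation loop becomes a single closure that times one iteration and returns its `Duration`, handed to `wgpu_benchmark::iter` together with a sub-benchmark name and a throughput annotation. A minimal sketch of that shape, assuming only the `iter` signature from benches/src/iter.rs below (`do_work` is a stand-in, not part of the diff):

    use std::time::Instant;
    use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};

    fn do_work() { /* the code under measurement */ }

    pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
        let mut results = Vec::new();
        results.push(iter(&ctx, "example", "elements", 8, || {
            let start = Instant::now();
            do_work(); // only this span is charged to the benchmark
            start.elapsed() // the harness sums these and divides by iterations
        }));
        Ok(results)
    }
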
@ -1,5 +1,5 @@
use criterion::*;
use std::{fs, process::Command};
use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};

const DIR_IN: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../naga/tests/in");

@ -141,28 +141,32 @@ fn get_wgsl_inputs() -> Inputs {
Inputs { inner: inputs }
}

fn frontends(c: &mut Criterion) {
let mut group = c.benchmark_group("front");
pub fn frontends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();

let mut inputs_wgsl = get_wgsl_inputs();

group.throughput(Throughput::Bytes(inputs_wgsl.bytes()));
group.bench_function("shader: naga module bincode decode", |b| {
inputs_wgsl.parse();
inputs_wgsl.parse();
inputs_wgsl.load_utf8();

let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();

b.iter(move || {
results.push(iter_auto(
&ctx,
"bincode decode",
"bytes",
inputs_wgsl.bytes() as u32,
move || {
for input in inputs_bin.iter() {
bincode::serde::decode_from_slice::<naga::Module, _>(
input,
@ -170,20 +174,23 @@ fn frontends(c: &mut Criterion) {
)
.unwrap();
}
});
});
},
));

group.bench_function("shader: wgsl-in", |b| {
inputs_wgsl.load_utf8();
let mut frontend = naga::front::wgsl::Frontend::new();

let mut frontend = naga::front::wgsl::Frontend::new();
b.iter(|| {
results.push(iter_auto(
&ctx,
"wgsl",
"bytes",
inputs_wgsl.bytes() as u32,
|| {
for input in &inputs_wgsl.inner {
frontend.set_options((&input.options.wgsl_in).into());
frontend.parse(input.string.as_ref().unwrap()).unwrap();
}
});
});
},
));

let inputs_spirv = Inputs::from_dir("spv", "spvasm");
assert!(!inputs_spirv.is_empty());
@ -220,13 +227,16 @@ fn frontends(c: &mut Criterion) {
assembled_spirv.push(bytemuck::pod_collect_to_vec(&output.stdout));
}

let total_bytes = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();
let total_bytes: u64 = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();

assert!(assembled_spirv.len() == inputs_spirv.inner.len() || assembled_spirv.is_empty());

group.throughput(Throughput::Bytes(total_bytes));
group.bench_function("shader: spv-in", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv parse",
"bytes",
total_bytes as u32,
|| {
for (i, input) in assembled_spirv.iter().enumerate() {
let params = &inputs_spirv.inner[i].options;
let SpirvInParameters {
@ -243,140 +253,152 @@ fn frontends(c: &mut Criterion) {
);
parser.parse().unwrap();
}
});
});
},
));

let mut inputs_vertex = Inputs::from_dir("glsl", "vert");
let mut inputs_fragment = Inputs::from_dir("glsl", "frag");
let mut inputs_compute = Inputs::from_dir("glsl", "comp");
assert!(!inputs_vertex.is_empty());
assert!(!inputs_fragment.is_empty());
// let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp");
group.throughput(Throughput::Bytes(
inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes()
));
group.bench_function("shader: glsl-in", |b| {
inputs_vertex.load();
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
// inputs_compute.load_utf8();
assert!(!inputs_compute.is_empty());

b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex));
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_fragment));
// TODO: This one hangs for some reason
// b.iter(move || parse_glsl(naga::ShaderStage::Compute, &inputs_compute));
});
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
inputs_compute.load_utf8();

results.push(iter_auto(
&ctx,
"glsl parse",
"bytes",
(inputs_vertex.bytes() + inputs_fragment.bytes() + inputs_compute.bytes()) as u32,
|| {
parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex);
parse_glsl(naga::ShaderStage::Fragment, &inputs_fragment);
parse_glsl(naga::ShaderStage::Compute, &inputs_compute);
},
));

Ok(results)
}

fn validation(c: &mut Criterion) {
pub fn validation(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();

let mut inputs = get_wgsl_inputs();

let mut group = c.benchmark_group("validate");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: validation", |b| {
inputs.load();
inputs.load_utf8();
inputs.parse();
inputs.parse();

let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
b.iter(|| {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());

results.push(iter_auto(
&ctx,
"validation",
"bytes",
inputs.bytes() as u32,
|| {
for input in &inputs.inner {
validator.validate(input.module.as_ref().unwrap()).unwrap();
}
});
});
group.finish();
},
));

Ok(results)
}

fn compact(c: &mut Criterion) {
pub fn compact(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
use naga::compact::{compact, KeepUnused};

let mut results = Vec::new();

let mut inputs = get_wgsl_inputs();

inputs.validate();
assert!(!inputs.is_empty());

let mut group = c.benchmark_group("compact");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: compact", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"compact",
"bytes",
inputs.bytes() as u32,
|| {
for input in &mut inputs.inner {
compact(input.module.as_mut().unwrap(), KeepUnused::No);
}
});
});
group.finish();
},
));

Ok(results)
}

fn backends(c: &mut Criterion) {
pub fn backends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();

let mut inputs = get_wgsl_inputs();

let mut group = c.benchmark_group("back");
// While normally this would be done inside the bench_function callback, we need to
// run this to properly know the size of the inputs, as any that fail validation
// will be removed.
inputs.validate();
assert!(!inputs.is_empty());

group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: wgsl-out", |b| {
b.iter(|| {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let total_bytes = inputs.bytes() as u32;

results.push(iter_auto(&ctx, "wgsl", "bytes", total_bytes, || {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
);
string.clear();
}
}
}));

results.push(iter_auto(&ctx, "spv", "bytes", total_bytes, || {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
string.clear();
data.clear();
}
}
});
});
}
}));

group.bench_function("shader: spv-out", |b| {
b.iter(|| {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
data.clear();
}
}
}
});
});
group.bench_function("shader: spv-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
@ -397,51 +419,51 @@ fn backends(c: &mut Criterion) {
}
}
}
});
});
},
));

group.bench_function("shader: msl-out", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
results.push(iter_auto(&ctx, "msl", "bytes", total_bytes, || {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
});
});
}
}));

group.bench_function("shader: hlsl-out", |b| {
b.iter(|| {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
); // may fail on unimplemented things
string.clear();
}
results.push(iter_auto(&ctx, "hlsl", "bytes", total_bytes, || {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
);
string.clear();
}
});
});
}
}));

group.bench_function("shader: glsl-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"glsl multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut string = String::new();
let options = naga::back::glsl::Options {
version: naga::back::glsl::Version::new_gles(320),
@ -462,7 +484,6 @@ fn backends(c: &mut Criterion) {
multiview: None,
};

// might be `Err` if missing features
if let Ok(mut writer) = naga::back::glsl::Writer::new(
&mut string,
module,
@ -471,14 +492,14 @@ fn backends(c: &mut Criterion) {
&pipeline_options,
naga::proc::BoundsCheckPolicies::default(),
) {
let _ = writer.write(); // might be `Err` if unsupported
let _ = writer.write();
}

string.clear();
}
}
});
});
}
},
));

criterion_group!(shader, frontends, validation, compact, backends);
Ok(results)
}

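Note the recurring move in the shader ports: `iter_auto` wall-clocks its entire closure, so setup that criterion tolerated inside `bench_function` (parsing inputs, assembling SPIR-V, building writers) is hoisted ahead of the measured closure. A sketch of the resulting function shape, assuming the `iter_auto` helper defined below (`parse_all` and the byte count are placeholders):

    use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};

    fn parse_all() { /* per-iteration work; setup stays outside the closure */ }

    pub fn frontends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
        let input_bytes = 1024; // placeholder for inputs.bytes() as u32
        let mut results = Vec::new();
        results.push(iter_auto(&ctx, "wgsl", "bytes", input_bytes, parse_all));
        Ok(results)
    }
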
34
benches/src/context.rs
Normal file
@ -0,0 +1,34 @@
use std::time::Duration;

#[derive(Clone, Copy)]
pub enum LoopControl {
    Iterations(u32),
    Time(Duration),
}

impl Default for LoopControl {
    fn default() -> Self {
        LoopControl::Time(Duration::from_secs(2))
    }
}

impl LoopControl {
    pub(crate) fn finished(&self, iterations: u32, elapsed: Duration) -> bool {
        match self {
            LoopControl::Iterations(target) => iterations >= *target,
            LoopControl::Time(target) => elapsed >= *target,
        }
    }
}

pub struct BenchmarkContext {
    pub(crate) override_iters: Option<LoopControl>,
    pub default_iterations: LoopControl,
    pub(crate) is_test: bool,
}

impl BenchmarkContext {
    pub fn is_test(&self) -> bool {
        self.is_test
    }
}

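`LoopControl` is the harness's only termination policy: run until a fixed iteration count, or until a wall-clock budget is spent, checked after each iteration. A crate-internal sketch of its behavior (`finished` is `pub(crate)`, so this only compiles inside `benches/src`):

    use std::time::Duration;

    let by_count = LoopControl::Iterations(3);
    assert!(!by_count.finished(2, Duration::from_secs(60))); // elapsed time is ignored
    assert!(by_count.finished(3, Duration::ZERO));

    let by_time = LoopControl::Time(Duration::from_secs(2));
    assert!(!by_time.finished(1_000, Duration::from_secs(1))); // iteration count is ignored
    assert!(by_time.finished(1, Duration::from_secs(2)));
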
27
benches/src/file.rs
Normal file
@ -0,0 +1,27 @@
use anyhow::Context as _;

use crate::BenchmarkFile;

const FILE_PREFIX: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../target/bench/");
pub const PREVIOUS: &str = "previous";

pub(crate) fn get_comparison_file(baseline: Option<&str>) -> Option<BenchmarkFile> {
    let file_name = baseline.unwrap_or(PREVIOUS);
    let path = format!("{FILE_PREFIX}{file_name}.json");

    let file = std::fs::read_to_string(path).ok()?;
    let benchmark_file: BenchmarkFile = serde_json::from_str(&file).ok()?;
    Some(benchmark_file)
}

pub(crate) fn write_results_file(
    file_name: &str,
    output_file: &BenchmarkFile,
) -> anyhow::Result<()> {
    let path = format!("{FILE_PREFIX}{file_name}.json");
    let json = serde_json::to_string_pretty(output_file)?;
    std::fs::create_dir_all(FILE_PREFIX)
        .with_context(|| format!("Trying to create directory {FILE_PREFIX}"))?;
    std::fs::write(&path, json).with_context(|| format!("Trying to write file {path}"))?;
    Ok(())
}

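For orientation, the baseline files this module reads and writes live at target/bench/<name>.json and are just the serialized `BenchmarkFile` (defined in lib.rs below). Assuming serde's standard encoding of `Duration` as secs/nanos, a saved file should look roughly like this; all values are illustrative:

    {
      "results": {
        "shader": [
          {
            "name": "wgsl",
            "avg_duration_per_iteration": { "secs": 0, "nanos": 412000 },
            "iterations": 4852,
            "throughput_unit": "bytes",
            "throughput_count_per_iteration": 123456
          }
        ]
      }
    }
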
97
benches/src/iter.rs
Normal file
@ -0,0 +1,97 @@
use std::time::Duration;

use crate::{BenchmarkContext, LoopControl, SubBenchResult};

pub fn iter(
    ctx: &BenchmarkContext,
    name: &str,
    throughput_unit: &str,
    throughput_count_per_iteration: u32,
    mut f: impl FnMut() -> Duration,
) -> SubBenchResult {
    profiling::scope!("iter", name);

    let mut iterations = 0_u32;
    let mut duration = Duration::ZERO;

    let control = if let Some(override_control) = ctx.override_iters {
        override_control
    } else {
        ctx.default_iterations
    };

    while !control.finished(iterations, duration) {
        duration += f();
        iterations += 1;
    }

    SubBenchResult {
        name: name.to_string(),
        avg_duration_per_iteration: duration / iterations,
        iterations,
        throughput_unit: throughput_unit.to_string(),
        throughput_count_per_iteration,
    }
}

pub fn iter_auto(
    ctx: &BenchmarkContext,
    name: &str,
    throughput_unit: &str,
    throughput_count_per_iteration: u32,
    mut f: impl FnMut(),
) -> SubBenchResult {
    iter(
        ctx,
        name,
        throughput_unit,
        throughput_count_per_iteration,
        || {
            let start = std::time::Instant::now();
            f();
            start.elapsed()
        },
    )
}

pub fn iter_many(
    ctx: &BenchmarkContext,
    names: Vec<String>,
    throughput_unit: &str,
    throughput_count_per_iteration: u32,
    mut f: impl FnMut() -> Vec<Duration>,
) -> Vec<SubBenchResult> {
    profiling::scope!("iter", &*names[0]);

    let mut iterations = 0_u32;
    let mut durations = vec![Duration::ZERO; names.len()];

    let control = if let Some(override_control) = ctx.override_iters {
        override_control
    } else {
        LoopControl::Time(Duration::from_secs(1))
    };

    // We use the first duration to determine whether to stop. This means the other sub-benchmarks
    // could have run for longer or shorter than intended, but that's acceptable.
    while !control.finished(iterations, *durations.first().unwrap_or(&Duration::ZERO)) {
        let iteration_durations = f();
        assert_eq!(iteration_durations.len(), names.len());
        for (i, dur) in iteration_durations.into_iter().enumerate() {
            durations[i] += dur;
        }
        iterations += 1;
    }

    durations
        .into_iter()
        .enumerate()
        .map(|(i, d)| SubBenchResult {
            name: names[i].to_string(),
            avg_duration_per_iteration: d / iterations,
            iterations,
            throughput_unit: throughput_unit.to_string(),
            throughput_count_per_iteration,
        })
        .collect()
}

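The three helpers cover three measurement shapes: `iter` for closures that time themselves (so per-iteration setup can be excluded), `iter_auto` for closures where timing the whole body is fine, and `iter_many` for a single pass that yields timings for several named sub-benchmarks at once. A sketch of `iter_many`, assuming two stand-in workloads that would share per-iteration setup:

    use std::time::{Duration, Instant};
    use wgpu_benchmark::{iter_many, BenchmarkContext, SubBenchResult};

    fn timed(f: impl FnOnce()) -> Duration {
        let start = Instant::now();
        f();
        start.elapsed()
    }

    pub fn paired(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
        // One closure invocation returns one Duration per name, in order.
        let results = iter_many(
            &ctx,
            vec!["encode".to_string(), "decode".to_string()],
            "elements",
            1,
            || vec![timed(|| { /* encode pass */ }), timed(|| { /* decode pass */ })],
        );
        Ok(results)
    }
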
261
benches/src/lib.rs
Normal file
@ -0,0 +1,261 @@
#![cfg(not(target_arch = "wasm32"))]
#![expect(clippy::disallowed_types)] // We're outside of the main wgpu codebase

//! Benchmarking framework for `wgpu`.
//!
//! This crate is a basic framework for benchmarking. Its design is guided
//! by a few goals:
//!
//! - Enumerating tests should be extremely cheap. `criterion` needs
//!   to run all of your benchmark functions to enumerate them during
//!   testing. This requires your code to contort itself to avoid doing
//!   any work until you enter a benchmark callback. This framework
//!   avoids that by having an explicit list of benchmark functions.
//! - It must be compatible with `cargo-nextest` and have a compatible
//!   "test" mode that runs each benchmark exactly once.
//! - It should have intuitive test grouping, allowing for quick execution
//!   of a reasonable baseline set of benchmarks during development, while
//!   still allowing a more exhaustive benchmark suite to be run if desired.
//!
//! By default all tests run for 2 seconds, but this can be overridden
//! by individual tests.

use std::{collections::HashMap, io::IsTerminal, time::Duration};

use anyhow::Result;
use pico_args::Arguments;
use serde::{Deserialize, Serialize};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};

mod context;
mod file;
mod iter;
mod print;

pub use context::*;
pub use iter::*;

use crate::file::PREVIOUS;

#[derive(Serialize, Deserialize, Default)]
pub struct BenchmarkFile {
    pub results: HashMap<String, Vec<SubBenchResult>>,
}

impl BenchmarkFile {
    pub fn get_result(
        &self,
        benchmark_name: &str,
        sub_benchmark_name: &str,
    ) -> Option<&SubBenchResult> {
        self.results
            .get(benchmark_name)?
            .iter()
            .find(|r| r.name == sub_benchmark_name)
    }
}

#[derive(Serialize, Deserialize)]
pub struct SubBenchResult {
    /// Name of the sub-benchmark.
    pub name: String,
    /// Average duration per iteration of the sub-benchmark.
    pub avg_duration_per_iteration: Duration,
    /// Total number of iterations executed.
    pub iterations: u32,
    /// Throughput unit description, e.g. "bytes", "elements", etc.
    pub throughput_unit: String,
    /// Number of throughput units processed per iteration.
    pub throughput_count_per_iteration: u32,
}

impl SubBenchResult {
    pub fn throughput_per_second(&self) -> f64 {
        let secs_f64 = self.avg_duration_per_iteration.as_secs_f64();
        if secs_f64 == 0.0 {
            return 0.0;
        }
        self.throughput_count_per_iteration as f64 / secs_f64
    }
}

pub struct Benchmark {
    pub name: &'static str,
    pub func: fn(BenchmarkContext) -> Result<Vec<SubBenchResult>>,
}

const HELP: &str = "\
Usage: wgpu-benchmark [OPTIONS] [BENCHMARK_NAME]

Modes:
  --bench                    Run in benchmark mode, comparing against previous results.
  --list                     List available benchmarks.
  <no flag>                  Run in test mode, executing each benchmark exactly once.

Test Matching:
  --exact                    When specifying BENCHMARK_NAME, only run exact matches.
  BENCHMARK_NAME             Only run benchmarks whose names contain this substring.

Comparison:
  -b, --baseline NAME        Specify a baseline file for comparison.
  -s, --save-baseline NAME   Save the results as a baseline file.

Timings:
  --iters N                  Override number of iterations per benchmark.
  --time SECONDS             Override time per benchmark in seconds.

Other:
  --color                    Set colored output (always, always-ansi, auto, never).
  --format terse             Specify --list output format (only 'terse' is supported).
  --no-capture               (Ignored)
";

pub fn main(benchmarks: Vec<Benchmark>) {
    let mut args = Arguments::from_env();

    let help = args.contains(["-h", "--help"]);

    if help {
        println!("{HELP}");
        return;
    }

    let mut color: ColorChoice = args
        .opt_value_from_str("--color")
        .unwrap_or(None)
        .unwrap_or(ColorChoice::Auto);
    if color == ColorChoice::Auto && !std::io::stdin().is_terminal() {
        color = ColorChoice::Never;
    }

    let exact = args.contains("--exact");
    // We don't actually need this flag, but cargo-nextest passes it in
    // test mode, so we need to accept it.
    let _no_capture = args.contains("--no-capture");

    #[expect(clippy::manual_map)] // So much clearer this way
    let mut override_iterations = if let Some(iters) = args.opt_value_from_str("--iters").unwrap() {
        Some(LoopControl::Iterations(iters))
    } else if let Some(seconds) = args.opt_value_from_str("--time").unwrap() {
        Some(LoopControl::Time(Duration::from_secs_f64(seconds)))
    } else {
        None
    };

    let baseline_name: Option<String> = args.opt_value_from_str(["-b", "--baseline"]).unwrap();
    let write_baseline: Option<String> =
        args.opt_value_from_str(["-s", "--save-baseline"]).unwrap();

    let is_bench = args.contains("--bench");
    let is_list = args.contains("--list");
    let is_test = !is_bench && !is_list;

    let format: Option<String> = args.opt_value_from_str("--format").unwrap();

    if let Some(fmt) = format {
        assert_eq!(fmt, "terse", "Only 'terse' format is supported.");
    }
    if let Some(ref baseline) = baseline_name {
        if baseline == PREVIOUS {
            eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
            return;
        }
    }
    if let Some(ref write_baseline) = write_baseline {
        if write_baseline == PREVIOUS {
            eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
            return;
        }
    }

    if override_iterations.is_none() && is_test {
        override_iterations = Some(LoopControl::Iterations(1));
    }

    let name = args.free_from_str::<String>().ok();

    let baseline = if is_bench {
        let res = file::get_comparison_file(baseline_name.as_deref());

        match (&res, baseline_name.as_deref()) {
            (Some(_), Some(baseline)) => {
                println!("Using baseline \"{baseline}\" for comparison.\n")
            }
            (None, Some(baseline)) => {
                eprintln!("Could not find baseline named {baseline:?}.\n");
                return;
            }
            (Some(_), None) => {
                println!("Using previous benchmark results for comparison.\n");
            }
            (None, None) => {
                println!("No previous benchmark results found for comparison.\n");
            }
        }

        res
    } else {
        None
    };

    let mut output_file = BenchmarkFile::default();

    let mut stdout = StandardStream::stdout(color);

    for bench in benchmarks {
        if let Some(ref bench_name) = name {
            if exact {
                if bench.name != bench_name {
                    continue;
                }
            } else if !bench.name.contains(bench_name) {
                continue;
            }
        }

        if is_list {
            println!("{}: benchmark", bench.name);
            continue;
        }

        let ctx = BenchmarkContext {
            override_iters: override_iterations,
            default_iterations: LoopControl::default(),
            is_test,
        };

        stdout
            .set_color(ColorSpec::new().set_fg(Some(Color::Blue)))
            .unwrap();
        println!("Running benchmark: {}", bench.name);
        stdout.reset().unwrap();

        let results = {
            profiling::scope!("bench", bench.name);
            let r = (bench.func)(ctx);
            match r {
                Ok(r) => r,
                Err(e) => {
                    eprintln!("  Error running benchmark '{}': {:?}", bench.name, e);
                    continue;
                }
            }
        };

        let previous_results = if let Some(ref baseline) = baseline {
            baseline.results.get(bench.name).map(|r| r.as_slice())
        } else {
            None
        };

        print::print_results(&mut stdout, &results, previous_results);

        output_file.results.insert(bench.name.to_string(), results);
    }

    file::write_results_file(PREVIOUS, &output_file).unwrap();
    if let Some(output_baseline) = write_baseline {
        file::write_results_file(&output_baseline, &output_file).unwrap();
    }
}

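Tying it together: a benchmark binary passes `main` an explicit `Benchmark` list, which is exactly what makes `--list` free (no benchmark code runs) and test mode a matter of forcing `LoopControl::Iterations(1)`. A hypothetical main.rs, with placeholder names:

    use wgpu_benchmark::{Benchmark, BenchmarkContext, SubBenchResult};

    fn example(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
        let _ = ctx; // placeholder body; a real benchmark returns its results
        Ok(Vec::new())
    }

    fn main() {
        wgpu_benchmark::main(vec![Benchmark {
            name: "example",
            func: example,
        }]);
    }

Per the HELP text above, a bare invocation is then test mode (each benchmark once), `--bench` compares against target/bench/previous.json, and `--bench --save-baseline NAME` additionally saves the run under NAME.
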
206
benches/src/print.rs
Normal file
@ -0,0 +1,206 @@
use std::collections::HashMap;
use std::io::Write;

use termcolor::{Color, ColorSpec, StandardStream, WriteColor};

use crate::SubBenchResult;

#[derive(Default, Clone)]
struct Delta {
    throughput_change_str: String,
    throughput_change: f64,
    time_change_str: String,
    time_change: f64,
}

impl Delta {
    fn new(previous: &SubBenchResult, current: &SubBenchResult) -> Self {
        let prev_throughput = previous.throughput_per_second();
        let curr_throughput = current.throughput_per_second();
        let delta_throughput = if prev_throughput != 0.0 {
            (curr_throughput - prev_throughput) / prev_throughput * 100.0
        } else {
            0.0
        };
        let throughput_change = format!(" ({delta_throughput:+.2}%)");

        let prev_time = previous.avg_duration_per_iteration;
        let curr_time = current.avg_duration_per_iteration;
        let delta_time = if prev_time.as_nanos() != 0 {
            (curr_time.as_secs_f64() - prev_time.as_secs_f64()) / prev_time.as_secs_f64() * 100.0
        } else {
            0.0
        };

        let time_change = format!("{delta_time:+.2}%; ");

        Delta {
            throughput_change_str: throughput_change,
            throughput_change: delta_throughput,
            time_change_str: time_change,
            time_change: delta_time,
        }
    }
}

/// Get a color spec for the given change percentage.
///
/// Positive changes are red (regression), negative changes are green (improvement).
/// This represents changes for time durations. For throughput changes, the sign should be inverted
/// before passing to this method.
fn get_change_color(percent_change: f64) -> ColorSpec {
    let mut color_spec = ColorSpec::new();
    if percent_change > 3.0 {
        color_spec.set_fg(Some(Color::Red));
    } else if percent_change < -3.0 {
        color_spec.set_fg(Some(Color::Green));
    } else {
        color_spec.set_fg(Some(Color::Yellow));
    }
    if percent_change.abs() > 15.0 {
        color_spec.set_intense(true);
    }
    color_spec
}

pub fn print_results(
    stdout: &mut StandardStream,
    results: &[SubBenchResult],
    previous_results: Option<&[SubBenchResult]>,
) {
    let mut deltas = HashMap::new();
    if let Some(previous_results) = previous_results {
        for result in results {
            if let Some(previous_result) = previous_results.iter().find(|r| r.name == result.name) {
                deltas.insert(result.name.clone(), Delta::new(previous_result, result));
            }
        }
    }

    let longest_throughput_change_len = deltas
        .values()
        .map(|d| d.throughput_change_str.len())
        .max()
        .unwrap_or(0);
    let longest_time_change_len = deltas
        .values()
        .map(|d| d.time_change_str.len())
        .max()
        .unwrap_or(0);

    let longest_name_len = results.iter().map(|r| r.name.len()).max().unwrap_or(0);
    let duration_strings: Vec<String> = results
        .iter()
        .map(|r| format!("{:.3?}", r.avg_duration_per_iteration))
        .collect();
    let longest_duration_len = duration_strings.iter().map(|s| s.len()).max().unwrap_or(0);

    let iterations_strings: Vec<String> = results
        .iter()
        .map(|r| format!("{}", r.iterations))
        .collect();
    let longest_iterations_len = iterations_strings
        .iter()
        .map(|s| s.len())
        .max()
        .unwrap_or(0);

    let throughput_strings: Vec<String> = results
        .iter()
        .map(|r| {
            let throughput_per_second = r.throughput_count_per_iteration as f64
                / r.avg_duration_per_iteration.as_secs_f64();
            human_scale(throughput_per_second)
        })
        .collect();
    let longest_throughput_len = throughput_strings
        .iter()
        .map(|s| s.len())
        .max()
        .unwrap_or(0);

    let longest_throughput_unit_len = results
        .iter()
        .map(|r| r.throughput_unit.len())
        .max()
        .unwrap_or(0);

    for (i, result) in results.iter().enumerate() {
        let delta = deltas.get(&result.name).cloned().unwrap_or_default();
        let time_color = get_change_color(delta.time_change);
        let throughput_color = get_change_color(-delta.throughput_change);

        stdout
            .set_color(ColorSpec::new().set_fg(Some(Color::Cyan)))
            .unwrap();
        write!(stdout, " {:>longest_name_len$}: ", result.name).unwrap();

        stdout.set_color(&time_color).unwrap();
        write!(stdout, "{:>longest_duration_len$} ", duration_strings[i],).unwrap();
        stdout.reset().unwrap();
        write!(stdout, "(").unwrap();
        stdout.set_color(&time_color).unwrap();
        write!(
            stdout,
            "{:>longest_time_change_len$}",
            delta.time_change_str
        )
        .unwrap();
        stdout.reset().unwrap();

        write!(
            stdout,
            "over {:>longest_iterations_len$} iter) ",
            result.iterations,
        )
        .unwrap();

        stdout.set_color(&throughput_color).unwrap();
        write!(stdout, "{:>longest_throughput_len$}", throughput_strings[i]).unwrap();
        stdout.reset().unwrap();
        write!(
            stdout,
            " {:>longest_throughput_unit_len$}/s",
            result.throughput_unit,
        )
        .unwrap();
        stdout.set_color(&throughput_color).unwrap();
        writeln!(
            stdout,
            "{:>longest_throughput_change_len$}",
            delta.throughput_change_str
        )
        .unwrap();
    }
    println!();
}

fn human_scale(value: f64) -> String {
    const PREFIXES: &[&str] = &["", "K", "M", "G", "T", "P"];

    if value == 0.0 {
        return "0".to_string();
    }

    let abs_value = value.abs();
    let exponent = (abs_value.log10() / 3.0).floor() as usize;
    let prefix_index = exponent.min(PREFIXES.len() - 1);

    let scaled = value / 10_f64.powi((prefix_index * 3) as i32);

    // Determine decimal places for 3 significant figures
    let decimal_places = if scaled.abs() >= 100.0 {
        0
    } else if scaled.abs() >= 10.0 {
        1
    } else {
        2
    };

    format!(
        "{:.prec$}{}",
        scaled,
        PREFIXES[prefix_index],
        prec = decimal_places
    )
}

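`human_scale` holds three significant figures by picking the decimal count from the scaled magnitude; worked through against the code above (crate-internal spot checks, since the function is private):

    assert_eq!(human_scale(987.0), "987");         // >= 100 after scaling: 0 decimals
    assert_eq!(human_scale(12_345.0), "12.3K");    // >= 10: 1 decimal
    assert_eq!(human_scale(1_500_000.0), "1.50M"); // < 10: 2 decimals
    assert_eq!(human_scale(0.0), "0");             // special-cased up front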