Speed Up Benchmarks in Test (#7129)

2025-12-08 21:26:17 +00:00 · 2025-02-13 18:48:13 -05:00 · 2025-02-13 18:48:13 -05:00 · 03a01df3cb
commit 03a01df3cb
parent 2f50426b35
5 changed files with 64 additions and 16 deletions
--- a/benches/benches/bind_groups.rs
+++ b/benches/benches/bind_groups.rs
@ -7,7 +7,17 @@ use criterion::{criterion_group, Criterion, Throughput};
 use nanorand::{Rng, WyRand};
 use std::sync::LazyLock;

-use crate::DeviceState;
+use crate::{is_test, DeviceState};
+
+// Creating 50_000 textures takes a considerable amount of time with syncval enabled.
+//
+// We greatly reduce the number of textures for the test case to keep the runtime
+// reasonable for testing.
+const MAX_TEXTURE_COUNT_BENCHMARK: u32 = 50_000;
+const TEXTURE_COUNTS_BENCHMARK: &[u32] = &[5, 50, 500, 5_000, 50_000];
+
+const MAX_TEXTURE_COUNT_TEST: u32 = 5;
+const TEXTURE_COUNTS_TEST: &[u32] = &[5];

 struct BindGroupState {
    device_state: DeviceState,
@ -19,7 +29,11 @@ impl BindGroupState {
    fn new() -> Self {
        let device_state = DeviceState::new();

-        const TEXTURE_COUNT: u32 = 50_000;
+        let texture_count = if is_test() {
+            MAX_TEXTURE_COUNT_TEST
+        } else {
+            MAX_TEXTURE_COUNT_BENCHMARK
+        };

        // Performance gets considerably worse if the resources are shuffled.
        //
@ -27,8 +41,8 @@ impl BindGroupState {
        // well defined usage order.
        let mut random = WyRand::new_seed(0x8BADF00D);

-        let mut texture_views = Vec::with_capacity(TEXTURE_COUNT as usize);
-        for i in 0..TEXTURE_COUNT {
+        let mut texture_views = Vec::with_capacity(texture_count as usize);
+        for i in 0..texture_count {
            let texture = device_state
                .device
                .create_texture(&wgpu::TextureDescriptor {
@ -64,7 +78,13 @@ fn run_bench(ctx: &mut Criterion) {

    let mut group = ctx.benchmark_group("Bind Group Creation");

-    for count in [5, 50, 500, 5_000, 50_000] {
+    let count_list = if is_test() {
+        TEXTURE_COUNTS_TEST
+    } else {
+        TEXTURE_COUNTS_BENCHMARK
+    };
+
+    for &count in count_list {
        group.throughput(Throughput::Elements(count as u64));
        group.bench_with_input(
            format!("{} Element Bind Group", count),
--- a/benches/benches/computepass.rs
+++ b/benches/benches/computepass.rs
@ -8,12 +8,12 @@ use nanorand::{Rng, WyRand};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;

-use crate::DeviceState;
+use crate::{is_test, DeviceState};

 fn dispatch_count() -> usize {
    // When testing we only want to run a very lightweight version of the benchmark
    // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
        8
    } else {
        10_000
@ -28,13 +28,21 @@ fn dispatch_count() -> usize {
 fn dispatch_count_bindless() -> usize {
    // On CI we only want to run a very lightweight version of the benchmark
    // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
        8
    } else {
        1_000
    }
 }

+/// Pass/thread counts swept by the compute pass benchmarks.
+///
+/// `run_bench` uses this list both for the number of compute passes in the
+/// single-threaded group and for the number of threads in the multi-threaded
+/// group. The single-threaded group previously swept `[1, 2, 4, 8]`, so the
+/// benchmark list has to include `1`; otherwise the one-pass baseline is
+/// silently dropped from the results.
+///
+/// When testing, only a single small configuration is run to keep the
+/// runtime short.
+fn thread_count_list() -> &'static [usize] {
+    if is_test() {
+        &[2]
+    } else {
+        &[1, 2, 4, 8]
+    }
+}
+
 // Must match the number of textures in the computepass.wgsl shader
 const TEXTURES_PER_DISPATCH: usize = 2;
 const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
@ -437,7 +445,7 @@ fn run_bench(ctx: &mut Criterion) {
    group.throughput(Throughput::Elements(dispatch_count as _));

    for time_submit in [false, true] {
-        for cpasses in [1, 2, 4, 8] {
+        for &cpasses in thread_count_list() {
            let dispatch_per_pass = dispatch_count / cpasses;

            let label = if time_submit {
@ -493,7 +501,7 @@ fn run_bench(ctx: &mut Criterion) {
    let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
    group.throughput(Throughput::Elements(dispatch_count as _));

-    for threads in [2, 4, 8] {
+    for &threads in thread_count_list() {
        let dispatch_per_pass = dispatch_count / threads;
        group.bench_function(
            format!("{threads} threads x {dispatch_per_pass} dispatch"),
--- a/benches/benches/renderpass.rs
+++ b/benches/benches/renderpass.rs
@ -8,18 +8,26 @@ use nanorand::{Rng, WyRand};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;

-use crate::DeviceState;
+use crate::{is_test, DeviceState};

 fn draw_count() -> usize {
    // When testing we only want to run a very lightweight version of the benchmark
    // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
        8
    } else {
        10_000
    }
 }

+/// Pass/thread counts swept by the render pass benchmarks.
+///
+/// `run_bench` iterates this list for both the render pass count and the
+/// thread count. When testing, only a single small configuration is run so
+/// the benchmark is exercised without taking long.
+fn thread_count_list() -> &'static [usize] {
+    const TEST_COUNTS: &[usize] = &[2];
+    const BENCHMARK_COUNTS: &[usize] = &[1, 2, 4, 8];
+
+    if is_test() {
+        return TEST_COUNTS;
+    }
+    BENCHMARK_COUNTS
+}
+
 // Must match the number of textures in the renderpass.wgsl shader
 const TEXTURES_PER_DRAW: usize = 7;
 const VERTEX_BUFFERS_PER_DRAW: usize = 2;
@ -438,7 +446,7 @@ fn run_bench(ctx: &mut Criterion) {
    group.throughput(Throughput::Elements(draw_count as _));

    for time_submit in [false, true] {
-        for rpasses in [1, 2, 4, 8] {
+        for &rpasses in thread_count_list() {
            let draws_per_pass = draw_count / rpasses;

            let label = if time_submit {
@ -499,7 +507,7 @@ fn run_bench(ctx: &mut Criterion) {
    let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
    group.throughput(Throughput::Elements(draw_count as _));

-    for threads in [2, 4, 8] {
+    for &threads in thread_count_list() {
        let draws_per_pass = draw_count / threads;
        group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
            LazyLock::force(&state);
--- a/benches/benches/resource_creation.rs
+++ b/benches/benches/resource_creation.rs
@ -4,7 +4,15 @@ use criterion::{criterion_group, Criterion, Throughput};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;

-use crate::DeviceState;
+use crate::{is_test, DeviceState};
+
+/// Thread counts swept by the resource creation benchmark.
+///
+/// When testing, only a single small configuration is run so the benchmark
+/// is exercised without taking long; the full benchmark sweeps 1, 2, 4 and 8
+/// threads.
+fn thread_count_list() -> &'static [usize] {
+    const TEST_COUNTS: &[usize] = &[2];
+    const BENCHMARK_COUNTS: &[usize] = &[1, 2, 4, 8];
+
+    if is_test() {
+        return TEST_COUNTS;
+    }
+    BENCHMARK_COUNTS
+}

 fn run_bench(ctx: &mut Criterion) {
    let state = LazyLock::new(DeviceState::new);
@ -14,7 +22,7 @@ fn run_bench(ctx: &mut Criterion) {
    let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
    group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));

-    for threads in [1, 2, 4, 8] {
+    for &threads in thread_count_list() {
        let resources_per_thread = RESOURCES_TO_CREATE / threads;
        group.bench_function(
            format!("{threads} threads x {resources_per_thread} resource"),
--- a/benches/benches/root.rs
+++ b/benches/benches/root.rs
@ -7,6 +7,10 @@ mod renderpass;
 mod resource_creation;
 mod shader;

+/// Reports whether the benchmarks are being run as tests rather than as
+/// real benchmarks, so each benchmark can pick a much smaller workload.
+///
+/// Only the presence of the `NEXTEST` environment variable is checked. Its
+/// value is ignored, so a value that is not valid Unicode still counts as
+/// "set" (which `std::env::var(..).is_ok()` would have reported as unset).
+fn is_test() -> bool {
+    std::env::var_os("NEXTEST").is_some()
+}
+
 struct DeviceState {
    adapter_info: wgpu::AdapterInfo,
    device: wgpu::Device,