Setup and use new benchmarking harness (#8511)

Connor Fitzgerald 2025-11-18 15:28:21 -05:00 committed by GitHub
parent 6043b059c4
commit 853ad6c464
15 changed files with 1313 additions and 926 deletions

Cargo.lock (generated, 122 changed lines)

@ -118,12 +118,6 @@ dependencies = [
"libc",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstream"
version = "0.6.21"
@ -592,12 +586,6 @@ dependencies = [
"thiserror 2.0.17",
]
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.2.46"
@ -652,33 +640,6 @@ dependencies = [
"libc",
]
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
@ -884,39 +845,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "criterion"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"itertools 0.13.0",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
dependencies = [
"cast",
"itertools 0.13.0",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
@ -3022,12 +2950,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "orbclient"
version = "0.3.49"
@ -3185,34 +3107,6 @@ dependencies = [
"winit 0.29.15",
]
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "png"
version = "0.18.0"
@ -4112,16 +4006,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tokio"
version = "1.48.0"
@ -4800,15 +4684,19 @@ dependencies = [
name = "wgpu-benchmark"
version = "27.0.0"
dependencies = [
"anyhow",
"bincode 2.0.1",
"bytemuck",
"criterion",
"naga",
"naga-test",
"nanorand 0.8.0",
"pico-args",
"pollster",
"profiling",
"rayon",
"serde",
"serde_json",
"termcolor",
"tracy-client",
"wgpu",
]


@ -250,7 +250,7 @@ deno_webgpu = { version = "0.181.0", path = "./deno_webgpu" }
deno_unsync = "0.4.4"
deno_error = "0.7.0"
tokio = "1.47"
termcolor = "1.1.3"
termcolor = "1.4.1"
# android dependencies
ndk-sys = "0.6"


@ -16,19 +16,18 @@ name = "wgpu-benchmark"
harness = false
[features]
# Uncomment these features to enable tracy and superluminal profiling.
# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
# superluminal = ["profiling/profile-with-superluminal"]
tracy = ["dep:tracy-client"]
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = [
'cfg(feature, values("tracy"))',
] }
[dependencies]
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
anyhow.workspace = true
bincode = { workspace = true, features = ["serde"] }
bytemuck.workspace = true
criterion.workspace = true
# criterion.workspace = true
naga = { workspace = true, features = [
"deserialize",
"serialize",
@ -43,8 +42,12 @@ naga = { workspace = true, features = [
] }
naga-test = { workspace = true, features = [] }
nanorand.workspace = true
pico-args.workspace = true
pollster.workspace = true
profiling.workspace = true
rayon.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
termcolor.workspace = true
tracy-client = { workspace = true, optional = true }
wgpu.workspace = true


@ -1,9 +1,6 @@
Collection of CPU benchmarks for `wgpu`.
These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance for users.
They all do very little GPU work and are testing the CPU performance of the API.
Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.
## Usage
@ -14,65 +11,30 @@ cargo bench -p wgpu-benchmark
cargo bench -p wgpu-benchmark -- "filter"
```
## Benchmarks
#### `Renderpass`
This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every draw call.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
#### `Resource Creation`
This benchmark measures the performance of creating large resources. By default it makes buffers that are 256MB. It tests this over a range of thread counts.
#### `Shader Compilation`
This benchmark measures the performance of naga parsing, validating, and generating shaders.
Use `WGPU_BACKEND` and `WGPU_ADAPTER_NAME` to adjust which device the benchmarks use. [More info on env vars](../README.md#environment-variables).
## Comparing Against a Baseline
To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
For example, to compare v0.20 against trunk, you could run the following:
For example, to compare v28 against trunk, you could run the following:
```sh
git checkout v0.20
git checkout v28
# Run the baseline benchmarks
cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
cargo bench -p wgpu-benchmark -- --save-baseline "v28"
git checkout trunk
# Run the current benchmarks
cargo bench -p wgpu-benchmark -- --baseline "v0.20"
cargo bench -p wgpu-benchmark -- --baseline "v28"
```
You can use this for any bits of code you want to compare.
The current benchmarking framework was added shortly before v28, so these comparisons only work between commits made after it landed. For earlier commits the same commands still work, but the comparison is handled by `criterion`.
## Integration with Profilers
The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
you need to uncomment the features in the `Cargo.toml` to allow features to be used.
Integrations are available for `tracy` and `superluminal`.
#### Tracy
@ -80,7 +42,7 @@ Tracy is available prebuilt for Windows on [github](https://github.com/wolfpld/t
```sh
# Once this is running, you can connect to it with the Tracy Profiler
cargo bench -p wgpu-benchmark --features tracy
cargo bench -p wgpu-benchmark --features tracy,profiling/profile-with-tracy
```
#### Superluminal
@ -89,10 +51,10 @@ Superluminal is a paid product for windows available [here](https://superluminal
```sh
# This command will build the benchmarks, and display the path to the executable
cargo bench -p wgpu-benchmark --features superluminal -- -h
cargo bench -p wgpu-benchmark --features profiling/profile-with-superluminal -- -h
# Have Superluminal run the following command (replacing with the path to the executable)
./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
<path_to_exe> --bench "filter"
```
#### `perf` and others
@ -105,6 +67,42 @@ For example, the command line tool `perf` can be used to profile the benchmarks.
cargo bench -p wgpu-benchmark -- -h
# Run the benchmarks with perf
perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
perf record <path_to_exe> --bench "filter"
```
## Benchmarks
#### `Renderpass Encoding`
This benchmark measures the performance of recording and submitting a render pass with a large
number of draw calls and resources, emulating an intense, more traditional graphics application.
By default it measures 10k draw calls, with 90k total resources.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the render pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all textures at once instead of switching
the bind group for every draw call.
#### `Computepass Encoding`
This benchmark measures the performance of recording and submitting a compute pass with a large
number of dispatches and resources.
By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
the compute pass into multiple passes over multiple command buffers.
If available, it also tests a bindless approach, binding all resources at once instead of switching
the bind group for every dispatch.
TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
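Both pass benchmarks report encoding and submission as separate results. The measurement pattern, simplified from the `iter_many` callbacks in the benchmark code further down in this commit (the `record_passes` closure stands in for their `run_subpass` calls), is roughly:

```rust
use std::time::{Duration, Instant};

fn encode_then_submit(
    queue: &wgpu::Queue,
    record_passes: impl FnOnce() -> Vec<wgpu::CommandBuffer>,
) -> Vec<Duration> {
    // Time command recording on its own...
    let encoding_start = Instant::now();
    let buffers = record_passes();
    let encoding_duration = encoding_start.elapsed();

    // ...then time the submission separately.
    let submit_start = Instant::now();
    queue.submit(buffers);
    let submit_duration = submit_start.elapsed();

    // One duration per reported label: "Encoding (...)" and "Submit (...)".
    vec![encoding_duration, submit_duration]
}
```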
#### `Device::create_buffer`
This benchmark measures the performance of creating large buffers.
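Concretely, each buffer is created roughly like this, mirroring `resource_creation.rs` further down in this commit (the helper name is illustrative):

```rust
fn create_large_buffer(device: &wgpu::Device) -> wgpu::Buffer {
    // 256 MiB, not mapped at creation; created from several threads in parallel.
    device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size: 256 * 1024 * 1024,
        usage: wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    })
}
```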
#### `Device::create_bind_group`
This benchmark measures the performance of creating large bind groups of 5 to 50,000 resources.
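The bind groups bind a single array of sampled textures, so the layout entry looks roughly like this, mirroring `bind_groups.rs` further down in this commit (the helper name is illustrative):

```rust
use std::num::NonZeroU32;

fn texture_array_layout_entry(count: u32) -> wgpu::BindGroupLayoutEntry {
    // One binding that holds `count` sampled textures at once.
    wgpu::BindGroupLayoutEntry {
        binding: 0,
        visibility: wgpu::ShaderStages::FRAGMENT,
        ty: wgpu::BindingType::Texture {
            sample_type: wgpu::TextureSampleType::Float { filterable: true },
            view_dimension: wgpu::TextureViewDimension::D2,
            multisampled: false,
        },
        count: Some(NonZeroU32::new(count).unwrap()),
    }
}
```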
#### `naga::back`, `naga::compact`, `naga::front`, and `naga::valid`
These benchmarks measure the performance of naga parsing, validating, compacting, and generating shaders.
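As a rough sketch of the work those four benchmarks time per shader (API usage mirrors `shader.rs` further down in this commit; the WGSL source is a stand-in):

```rust
fn naga_round_trip() {
    let source = "@compute @workgroup_size(1) fn main() {}";

    // naga::front: parse WGSL into a Module.
    let mut module = naga::front::wgsl::Frontend::new().parse(source).unwrap();

    // naga::compact: strip unused items in place.
    naga::compact::compact(&mut module, naga::compact::KeepUnused::No);

    // naga::valid: validate and produce the ModuleInfo the backends need.
    let info = naga::valid::Validator::new(
        naga::valid::ValidationFlags::all(),
        naga::valid::Capabilities::all(),
    )
    .validate(&module)
    .unwrap();

    // naga::back: write the module back out, here as SPIR-V words.
    let mut words = Vec::new();
    let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
    let _ = writer.write(&module, &info, None, &None, &mut words);
}
```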


@ -1,173 +1,127 @@
use std::{
num::NonZeroU32,
time::{Duration, Instant},
};
use std::{num::NonZeroU32, time::Instant};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};
use crate::{is_test, DeviceState};
use crate::DeviceState;
struct Params {
max_texture_count: u32,
texture_counts: &'static [u32],
}
// Creating 50_000 textures takes a considerable amount of time with syncval enabled.
//
// We greatly reduce the number of textures for the test case to keep the runtime
// reasonable for testing.
const MAX_TEXTURE_COUNT_BENCHMARK: u32 = 50_000;
const TEXTURE_COUNTS_BENCHMARK: &[u32] = &[5, 50, 500, 5_000, 50_000];
const BENCHMARK_PARAMS: Params = Params {
max_texture_count: 50_000,
texture_counts: &[5, 50, 500, 5_000, 50_000],
};
const MAX_TEXTURE_COUNT_TEST: u32 = 5;
const TEXTURE_COUNTS_TEST: &[u32] = &[5];
const TEST_PARAMS: Params = Params {
max_texture_count: 5,
texture_counts: &[5],
};
struct BindGroupState {
device_state: DeviceState,
texture_views: Vec<wgpu::TextureView>,
}
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let device_state = DeviceState::new();
impl BindGroupState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
let device_state = DeviceState::new();
let texture_count = if is_test() {
MAX_TEXTURE_COUNT_TEST
} else {
MAX_TEXTURE_COUNT_BENCHMARK
};
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
random.shuffle(&mut texture_views);
Self {
device_state,
texture_views,
}
if !device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
anyhow::bail!("Device does not support required feature TEXTURE_BINDING_ARRAY");
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(BindGroupState::new);
let mut group = ctx.benchmark_group("Bind Group Creation");
let count_list = if is_test() {
TEXTURE_COUNTS_TEST
let params = if ctx.is_test() {
TEST_PARAMS
} else {
TEXTURE_COUNTS_BENCHMARK
BENCHMARK_PARAMS
};
for &count in count_list {
group.throughput(Throughput::Elements(count as u64));
group.bench_with_input(
format!("{count} Element Bind Group"),
&count,
|b, &count| {
b.iter_custom(|iters| {
if !state
.device_state
.device
.features()
.contains(wgpu::Features::TEXTURE_BINDING_ARRAY)
{
return Duration::ZERO;
}
// Performance gets considerably worse if the resources are shuffled.
//
// This more closely matches the real-world use case where resources have no
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
if count
> state
.device_state
.device
.limits()
.max_sampled_textures_per_shader_stage
{
return Duration::ZERO;
}
let bind_group_layout = state.device_state.device.create_bind_group_layout(
&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float {
filterable: true,
},
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
},
);
let texture_view_refs: Vec<_> =
state.texture_views.iter().take(count as usize).collect();
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let bind_group = state.device_state.device.create_bind_group(
&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(
&texture_view_refs,
),
}],
label: None,
},
);
duration += start.elapsed();
drop(bind_group);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
});
},
);
let mut texture_views = Vec::with_capacity(params.max_texture_count as usize);
for i in 0..params.max_texture_count {
let texture = device_state
.device
.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("Texture {i}")),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsages::TEXTURE_BINDING,
view_formats: &[],
});
texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
label: Some(&format!("Texture View {i}")),
..Default::default()
}));
}
}
random.shuffle(&mut texture_views);
criterion_group! {
name = bind_groups;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
let mut results = Vec::new();
for &count in params.texture_counts {
let bind_group_layout =
device_state
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(count).unwrap()),
}],
});
let texture_view_refs: Vec<_> = texture_views.iter().take(count as usize).collect();
let name = format!("{count} Textures");
let res = iter(&ctx, &name, "bindings", count, || {
let start = Instant::now();
let bind_group = device_state
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &bind_group_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs),
}],
label: None,
});
let time = start.elapsed();
drop(bind_group);
device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
time
});
results.push(res);
}
Ok(results)
}


@ -3,17 +3,16 @@ use std::{
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_auto, iter_many, BenchmarkContext, LoopControl};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn dispatch_count() -> usize {
fn dispatch_count(ctx: &BenchmarkContext) -> usize {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
@ -25,18 +24,18 @@ fn dispatch_count() -> usize {
// This is in fact so slow that it makes the benchmark unusable when we use the same amount of
// resources as the regular benchmark.
// For details see https://github.com/gfx-rs/wgpu/issues/5766
fn dispatch_count_bindless() -> usize {
fn dispatch_count_bindless(ctx: &BenchmarkContext) -> usize {
// On CI we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
1_000
}
}
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[2, 4, 8]
@ -62,11 +61,11 @@ struct ComputepassState {
impl ComputepassState {
/// Create and prepare all the resources needed for the computepass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();
let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let dispatch_count = dispatch_count(ctx);
let dispatch_count_bindless = dispatch_count_bindless(ctx);
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
@ -377,10 +376,15 @@ impl ComputepassState {
}
}
fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer {
fn run_subpass(
&self,
ctx: &BenchmarkContext,
pass_number: usize,
total_passes: usize,
) -> wgpu::CommandBuffer {
profiling::scope!("Computepass", &format!("Pass {pass_number}/{total_passes}"));
let dispatch_count = dispatch_count();
let dispatch_count = dispatch_count(ctx);
let dispatch_per_pass = dispatch_count / total_passes;
let mut encoder = self
@ -431,183 +435,140 @@ impl ComputepassState {
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(ComputepassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = ComputepassState::new(&ctx);
let dispatch_count = dispatch_count();
let dispatch_count_bindless = dispatch_count_bindless();
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}
let dispatch_count = dispatch_count(&ctx);
let dispatch_count_bindless = dispatch_count_bindless(&ctx);
let mut results = Vec::new();
// Test 10k dispatch calls split up into 1, 2, 4, and 8 computepasses
let mut group = ctx.benchmark_group("Computepass: Single Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));
for &cpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({cpasses} passes)"),
format!("Submit ({cpasses} passes)"),
];
for time_submit in [false, true] {
for &cpasses in thread_count_list() {
let dispatch_per_pass = dispatch_count / cpasses;
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
let encoding_start = Instant::now();
for i in 0..cpasses {
buffers.push(state.run_subpass(&ctx, i, cpasses));
}
let encoding_duration = encoding_start.elapsed();
let label = if time_submit {
"Submit Time"
} else {
"Computepass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
group.bench_function(
format!("{cpasses} computepasses x {dispatch_per_pass} dispatches ({label})"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let mut start = Instant::now();
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(cpasses);
for i in 0..cpasses {
buffers.push(state.run_subpass(i, cpasses));
}
if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}
state.device_state.queue.submit(buffers);
if time_submit {
duration += start.elapsed();
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
}
group.finish();
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
group.throughput(Throughput::Elements(dispatch_count as _));
for &threads in thread_count_list() {
let dispatch_per_pass = dispatch_count / threads;
group.bench_function(
format!("{threads} threads x {dispatch_per_pass} dispatch"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads))
.collect::<Vec<_>>();
duration += start.elapsed();
state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
group.finish();
// Test 10k dispatch calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Computepass: Bindless");
group.throughput(Throughput::Elements(dispatch_count_bindless as _));
group.bench_function(format!("{dispatch_count_bindless} dispatch"), |b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs(1);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffer = state.run_bindless_pass(dispatch_count_bindless);
duration += start.elapsed();
state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
group.finish();
vec![encoding_duration, submit_duration]
},
));
}
ctx.bench_function(
// Test 10k dispatch calls split up over 2, 4, and 8 threads.
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count as _,
|| {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(&ctx, i, threads))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
vec![encoding_duration, submit_duration]
},
));
}
// Test 10k dispatch calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];
results.extend(iter_many(
&ctx,
labels,
"dispatches",
dispatch_count_bindless as _,
|| {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(dispatch_count_bindless);
let encoding_duration = encoding_start.elapsed();
let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
vec![encoding_duration, submit_duration]
},
));
}
// Test empty submit overhead with all resources
let texture_count = dispatch_count * TEXTURES_PER_DISPATCH;
let storage_buffer_count = dispatch_count * STORAGE_BUFFERS_PER_DISPATCH;
let storage_texture_count = dispatch_count * STORAGE_TEXTURES_PER_DISPATCH;
results.push(iter_auto(
&ctx,
&format!(
"Computepass: Empty Submit with {} Resources",
"Empty Submit with {} Resources",
texture_count + storage_texture_count + storage_buffer_count
),
|b| {
LazyLock::force(&state);
b.iter(|| state.device_state.queue.submit([]));
"submits",
1,
|| {
state.device_state.queue.submit([]);
},
);
}
));
criterion_group! {
name = computepass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,5 +1,7 @@
use criterion::criterion_main;
#[cfg_attr(target_arch = "wasm32", no_main)]
#[cfg(not(target_arch = "wasm32"))]
use pollster::block_on;
use wgpu_benchmark::Benchmark;
mod bind_groups;
mod computepass;
@ -7,10 +9,6 @@ mod renderpass;
mod resource_creation;
mod shader;
fn is_test() -> bool {
std::env::var("NEXTEST").is_ok()
}
struct DeviceState {
adapter_info: wgpu::AdapterInfo,
device: wgpu::Device,
@ -41,14 +39,17 @@ impl DeviceState {
let adapter_info = adapter.get_info();
eprintln!("{adapter_info:?}");
println!(
" Using adapter: {} ({:?})",
adapter_info.name, adapter_info.backend
);
let (device, queue) = block_on(adapter.request_device(&wgpu::DeviceDescriptor {
required_features: adapter.features(),
required_limits: adapter.limits(),
memory_hints: wgpu::MemoryHints::Performance,
experimental_features: unsafe { wgpu::ExperimentalFeatures::enabled() },
label: Some("Compute/RenderPass Device"),
label: None,
trace: wgpu::Trace::Off,
}))
.unwrap();
@ -61,10 +62,41 @@ impl DeviceState {
}
}
criterion_main!(
bind_groups::bind_groups,
renderpass::renderpass,
computepass::computepass,
resource_creation::resource_creation,
shader::shader
);
fn main() {
let benchmarks = vec![
Benchmark {
name: "Device::create_bind_group",
func: bind_groups::run_bench,
},
Benchmark {
name: "Device::create_buffer",
func: resource_creation::run_bench,
},
Benchmark {
name: "naga::front",
func: shader::frontends,
},
Benchmark {
name: "naga::valid",
func: shader::validation,
},
Benchmark {
name: "naga::compact",
func: shader::compact,
},
Benchmark {
name: "naga::back",
func: shader::backends,
},
Benchmark {
name: "Renderpass Encoding",
func: renderpass::run_bench,
},
Benchmark {
name: "Computepass Encoding",
func: computepass::run_bench,
},
];
wgpu_benchmark::main(benchmarks);
}
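For reference, a registered benchmark is just a function with this shape; a minimal hypothetical example using the harness's `iter_auto` helper (defined in `benches/src/iter.rs` below) would be:

```rust
use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};

// Hypothetical benchmark: times a trivial CPU-side operation.
fn run_example_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
    let data: Vec<u32> = (0..1_000).collect();

    // `iter_auto` times the closure itself; "elements" is the throughput unit
    // and 1_000 the throughput count per iteration.
    let result = iter_auto(&ctx, "sum 1k elements", "elements", 1_000, || {
        let _ = data.iter().sum::<u32>();
    });

    Ok(vec![result])
}

// Registered above as: Benchmark { name: "Example", func: run_example_bench }
```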


@ -3,34 +3,33 @@ use std::{
time::{Duration, Instant},
};
use criterion::{criterion_group, Criterion, Throughput};
use nanorand::{Rng, WyRand};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter_many, BenchmarkContext, LoopControl};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn draw_count() -> usize {
fn draw_count(ctx: &BenchmarkContext) -> u32 {
// When testing we only want to run a very lightweight version of the benchmark
// to ensure that it does not break.
if is_test() {
if ctx.is_test() {
8
} else {
10_000
}
}
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [u32] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
&[1, 2, 4]
}
}
// Must match the number of textures in the renderpass.wgsl shader
const TEXTURES_PER_DRAW: usize = 7;
const VERTEX_BUFFERS_PER_DRAW: usize = 2;
const TEXTURES_PER_DRAW: u32 = 7;
const VERTEX_BUFFERS_PER_DRAW: u32 = 2;
struct RenderpassState {
device_state: DeviceState,
@ -47,10 +46,10 @@ struct RenderpassState {
impl RenderpassState {
/// Create and prepare all the resources needed for the renderpass benchmark.
fn new() -> Self {
fn new(ctx: &BenchmarkContext) -> Self {
let device_state = DeviceState::new();
let draw_count = draw_count();
let draw_count = draw_count(ctx);
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;
@ -69,10 +68,10 @@ impl RenderpassState {
// well defined usage order.
let mut random = WyRand::new_seed(0x8BADF00D);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for i in 0..TEXTURES_PER_DRAW {
bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
binding: i as u32,
binding: i,
visibility: wgpu::ShaderStages::FRAGMENT,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
@ -91,7 +90,7 @@ impl RenderpassState {
entries: &bind_group_layout_entries,
});
let mut texture_views = Vec::with_capacity(texture_count);
let mut texture_views = Vec::with_capacity(texture_count as usize);
for i in 0..texture_count {
let texture = device_state
.device
@ -118,14 +117,14 @@ impl RenderpassState {
let texture_view_refs: Vec<_> = texture_views.iter().collect();
let mut bind_groups = Vec::with_capacity(draw_count);
let mut bind_groups = Vec::with_capacity(draw_count as usize);
for draw_idx in 0..draw_count {
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW as usize);
for tex_idx in 0..TEXTURES_PER_DRAW {
entries.push(wgpu::BindGroupEntry {
binding: tex_idx as u32,
binding: tex_idx,
resource: wgpu::BindingResource::TextureView(
&texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx],
&texture_views[(draw_idx * TEXTURES_PER_DRAW + tex_idx) as usize],
),
});
}
@ -155,7 +154,7 @@ impl RenderpassState {
push_constant_ranges: &[],
});
let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count);
let mut vertex_buffers = Vec::with_capacity(vertex_buffer_count as usize);
for _ in 0..vertex_buffer_count {
vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@ -166,7 +165,7 @@ impl RenderpassState {
}
random.shuffle(&mut vertex_buffers);
let mut index_buffers = Vec::with_capacity(draw_count);
let mut index_buffers = Vec::with_capacity(draw_count as usize);
for _ in 0..draw_count {
index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
@ -177,12 +176,12 @@ impl RenderpassState {
}
random.shuffle(&mut index_buffers);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]);
vertex_buffer_attributes.push(wgpu::vertex_attr_array![i => Float32x4]);
}
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW as usize);
for attributes in &vertex_buffer_attributes {
vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
array_stride: 16,
@ -263,7 +262,7 @@ impl RenderpassState {
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: Some(NonZeroU32::new(texture_count as u32).unwrap()),
count: Some(NonZeroU32::new(texture_count).unwrap()),
}],
});
@ -343,9 +342,9 @@ impl RenderpassState {
fn run_subpass(
&self,
pass_number: usize,
total_passes: usize,
draw_count: usize,
pass_number: u32,
total_passes: u32,
draw_count: u32,
) -> wgpu::CommandBuffer {
profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));
@ -377,15 +376,16 @@ impl RenderpassState {
let end_idx = start_idx + draws_per_pass;
for draw_idx in start_idx..end_idx {
render_pass.set_pipeline(&self.pipeline);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
render_pass.set_bind_group(0, &self.bind_groups[draw_idx as usize], &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(
i as u32,
self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..),
i,
self.vertex_buffers[(draw_idx * VERTEX_BUFFERS_PER_DRAW + i) as usize]
.slice(..),
);
}
render_pass.set_index_buffer(
self.index_buffers[draw_idx].slice(..),
self.index_buffers[draw_idx as usize].slice(..),
wgpu::IndexFormat::Uint32,
);
render_pass.draw_indexed(0..3, 0, 0..1);
@ -396,7 +396,7 @@ impl RenderpassState {
encoder.finish()
}
fn run_bindless_pass(&self, draw_count: usize) -> wgpu::CommandBuffer {
fn run_bindless_pass(&self, draw_count: u32) -> wgpu::CommandBuffer {
profiling::scope!("Bindless Renderpass");
let mut encoder = self
@ -424,12 +424,12 @@ impl RenderpassState {
render_pass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
render_pass.set_bind_group(0, Some(self.bindless_bind_group.as_ref().unwrap()), &[]);
for i in 0..VERTEX_BUFFERS_PER_DRAW {
render_pass.set_vertex_buffer(i as u32, self.vertex_buffers[0].slice(..));
render_pass.set_vertex_buffer(i, self.vertex_buffers[0].slice(..));
}
render_pass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);
for draw_idx in 0..draw_count {
render_pass.draw_indexed(0..3, 0, draw_idx as u32..draw_idx as u32 + 1);
render_pass.draw_indexed(0..3, 0, draw_idx..draw_idx + 1);
}
drop(render_pass);
@ -438,178 +438,103 @@ impl RenderpassState {
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(RenderpassState::new);
pub fn run_bench(mut ctx: BenchmarkContext) -> anyhow::Result<Vec<wgpu_benchmark::SubBenchResult>> {
let state = RenderpassState::new(&ctx);
let draw_count = draw_count();
let vertex_buffer_count = draw_count * VERTEX_BUFFERS_PER_DRAW;
let texture_count = draw_count * TEXTURES_PER_DRAW;
ctx.default_iterations = LoopControl::Time(Duration::from_secs(3));
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
anyhow::bail!("Benchmark unsupported on Paravirtualized GPUs");
}
let draw_count = draw_count(&ctx);
let mut results = Vec::new();
// Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &rpasses in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({rpasses} passes)"),
format!("Submit ({rpasses} passes)"),
];
for time_submit in [false, true] {
for &rpasses in thread_count_list() {
let draws_per_pass = draw_count / rpasses;
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses as usize);
let encoding_start = Instant::now();
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}
let encoding_duration = encoding_start.elapsed();
let label = if time_submit {
"Submit Time"
} else {
"Renderpass Time"
};
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
group.bench_function(
format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
|b| {
LazyLock::force(&state);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs(1);
}
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let mut start = Instant::now();
let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
for i in 0..rpasses {
buffers.push(state.run_subpass(i, rpasses, draw_count));
}
if time_submit {
start = Instant::now();
} else {
duration += start.elapsed();
}
state.device_state.queue.submit(buffers);
if time_submit {
duration += start.elapsed();
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
},
);
}
vec![encoding_duration, submit_duration]
}));
}
group.finish();
// Test 10k draw calls split up over 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
group.throughput(Throughput::Elements(draw_count as _));
for &threads in thread_count_list(&ctx) {
let labels = vec![
format!("Encoding ({threads} threads)"),
format!("Submit ({threads} threads)"),
];
for &threads in thread_count_list() {
let draws_per_pass = draw_count / threads;
group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();
let encoding_duration = encoding_start.elapsed();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit(buffers);
let submit_duration = submit_start.elapsed();
// This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
if state.device_state.adapter_info.name.contains("Paravirtual") {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|i| state.run_subpass(i, threads, draw_count))
.collect::<Vec<_>>();
duration += start.elapsed();
state.device_state.queue.submit(buffers);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
vec![encoding_duration, submit_duration]
}));
}
group.finish();
// Test 10k draw calls split up over 1, 2, 4, and 8 threads.
let mut group = ctx.benchmark_group("Renderpass: Bindless");
group.throughput(Throughput::Elements(draw_count as _));
// Test 10k draw calls with bindless rendering.
if state.bindless_bind_group.is_some() {
let labels = vec![
"Encoding (bindless)".to_string(),
"Submit (bindless)".to_string(),
];
group.bench_function(format!("{draw_count} draws"), |b| {
LazyLock::force(&state);
results.extend(iter_many(&ctx, labels, "draw calls", draw_count, || {
let encoding_start = Instant::now();
let buffer = state.run_bindless_pass(draw_count);
let encoding_duration = encoding_start.elapsed();
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let submit_start = Instant::now();
state.device_state.queue.submit([buffer]);
let submit_duration = submit_start.elapsed();
// Need bindless to run this benchmark
if state.bindless_bind_group.is_none() {
return Duration::from_secs_f32(1.0);
}
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
let mut duration = Duration::ZERO;
vec![encoding_duration, submit_duration]
}));
}
for _ in 0..iters {
profiling::scope!("benchmark iteration");
let start = Instant::now();
let buffer = state.run_bindless_pass(draw_count);
duration += start.elapsed();
state.device_state.queue.submit([buffer]);
state
.device_state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
});
group.finish();
ctx.bench_function(
&format!(
"Renderpass: Empty Submit with {} Resources",
texture_count + vertex_buffer_count
),
|b| {
LazyLock::force(&state);
b.iter(|| state.device_state.queue.submit([]));
},
);
}
criterion_group! {
name = renderpass;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,82 +1,62 @@
use std::time::{Duration, Instant};
use std::time::Instant;
use criterion::{criterion_group, Criterion, Throughput};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::LazyLock;
use wgpu_benchmark::{iter, BenchmarkContext, SubBenchResult};
use crate::{is_test, DeviceState};
use crate::DeviceState;
fn thread_count_list() -> &'static [usize] {
if is_test() {
fn thread_count_list(ctx: &BenchmarkContext) -> &'static [usize] {
if ctx.is_test() {
&[2]
} else {
&[1, 2, 4, 8]
}
}
fn run_bench(ctx: &mut Criterion) {
let state = LazyLock::new(DeviceState::new);
pub fn run_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let state = DeviceState::new();
const RESOURCES_TO_CREATE: usize = 8;
let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));
for &threads in thread_count_list() {
let mut results = Vec::new();
for &threads in thread_count_list(&ctx) {
let resources_per_thread = RESOURCES_TO_CREATE / threads;
group.bench_function(
format!("{threads} threads x {resources_per_thread} resource"),
|b| {
LazyLock::force(&state);
b.iter_custom(|iters| {
profiling::scope!("benchmark invocation");
let mut duration = Duration::ZERO;
for _ in 0..iters {
profiling::scope!("benchmark iteration");
// We can't create too many resources at once, so we do it 8 resources at a time.
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
results.push(iter(
&ctx,
&format!("{threads} threads"),
"buffers",
RESOURCES_TO_CREATE as u32,
|| {
let start = Instant::now();
let buffers = (0..threads)
.into_par_iter()
.map(|_| {
(0..resources_per_thread)
.map(|_| {
(0..resources_per_thread)
.map(|_| {
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>()
state.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: 256 * 1024 * 1024,
usage: wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
})
})
.collect::<Vec<_>>();
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
let duration = start.elapsed();
duration += start.elapsed();
drop(buffers);
drop(buffers);
state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
state.queue.submit([]);
state
.device
.poll(wgpu::PollType::wait_indefinitely())
.unwrap();
}
duration
})
duration
},
);
));
}
group.finish();
}
criterion_group! {
name = resource_creation;
config = Criterion::default().measurement_time(Duration::from_secs(10));
targets = run_bench,
Ok(results)
}


@ -1,5 +1,5 @@
use criterion::*;
use std::{fs, process::Command};
use wgpu_benchmark::{iter_auto, BenchmarkContext, SubBenchResult};
const DIR_IN: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../naga/tests/in");
@ -141,28 +141,32 @@ fn get_wgsl_inputs() -> Inputs {
Inputs { inner: inputs }
}
fn frontends(c: &mut Criterion) {
let mut group = c.benchmark_group("front");
pub fn frontends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs_wgsl = get_wgsl_inputs();
group.throughput(Throughput::Bytes(inputs_wgsl.bytes()));
group.bench_function("shader: naga module bincode decode", |b| {
inputs_wgsl.parse();
inputs_wgsl.parse();
inputs_wgsl.load_utf8();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();
let inputs_bin = inputs_wgsl
.inner
.iter()
.map(|input| {
bincode::serde::encode_to_vec(
input.module.as_ref().unwrap(),
bincode::config::standard(),
)
.unwrap()
})
.collect::<Vec<_>>();
b.iter(move || {
results.push(iter_auto(
&ctx,
"bincode decode",
"bytes",
inputs_wgsl.bytes() as u32,
move || {
for input in inputs_bin.iter() {
bincode::serde::decode_from_slice::<naga::Module, _>(
input,
@ -170,20 +174,23 @@ fn frontends(c: &mut Criterion) {
)
.unwrap();
}
});
});
},
));
group.bench_function("shader: wgsl-in", |b| {
inputs_wgsl.load_utf8();
let mut frontend = naga::front::wgsl::Frontend::new();
let mut frontend = naga::front::wgsl::Frontend::new();
b.iter(|| {
results.push(iter_auto(
&ctx,
"wgsl",
"bytes",
inputs_wgsl.bytes() as u32,
|| {
for input in &inputs_wgsl.inner {
frontend.set_options((&input.options.wgsl_in).into());
frontend.parse(input.string.as_ref().unwrap()).unwrap();
}
});
});
},
));
let inputs_spirv = Inputs::from_dir("spv", "spvasm");
assert!(!inputs_spirv.is_empty());
@ -220,13 +227,16 @@ fn frontends(c: &mut Criterion) {
assembled_spirv.push(bytemuck::pod_collect_to_vec(&output.stdout));
}
let total_bytes = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();
let total_bytes: u64 = assembled_spirv.iter().map(|spv| spv.len() as u64).sum();
assert!(assembled_spirv.len() == inputs_spirv.inner.len() || assembled_spirv.is_empty());
group.throughput(Throughput::Bytes(total_bytes));
group.bench_function("shader: spv-in", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv parse",
"bytes",
total_bytes as u32,
|| {
for (i, input) in assembled_spirv.iter().enumerate() {
let params = &inputs_spirv.inner[i].options;
let SpirvInParameters {
@ -243,140 +253,152 @@ fn frontends(c: &mut Criterion) {
);
parser.parse().unwrap();
}
});
});
},
));
let mut inputs_vertex = Inputs::from_dir("glsl", "vert");
let mut inputs_fragment = Inputs::from_dir("glsl", "frag");
let mut inputs_compute = Inputs::from_dir("glsl", "comp");
assert!(!inputs_vertex.is_empty());
assert!(!inputs_fragment.is_empty());
// let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp");
group.throughput(Throughput::Bytes(
inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes()
));
group.bench_function("shader: glsl-in", |b| {
inputs_vertex.load();
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
// inputs_compute.load_utf8();
assert!(!inputs_compute.is_empty());
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex));
b.iter(|| parse_glsl(naga::ShaderStage::Vertex, &inputs_fragment));
// TODO: This one hangs for some reason
// b.iter(move || parse_glsl(naga::ShaderStage::Compute, &inputs_compute));
});
inputs_vertex.load_utf8();
inputs_fragment.load_utf8();
inputs_compute.load_utf8();
results.push(iter_auto(
&ctx,
"glsl parse",
"bytes",
(inputs_vertex.bytes() + inputs_fragment.bytes() + inputs_compute.bytes()) as u32,
|| {
parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex);
parse_glsl(naga::ShaderStage::Fragment, &inputs_fragment);
parse_glsl(naga::ShaderStage::Compute, &inputs_compute);
},
));
Ok(results)
}
fn validation(c: &mut Criterion) {
pub fn validation(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
let mut group = c.benchmark_group("validate");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: validation", |b| {
inputs.load();
inputs.load_utf8();
inputs.parse();
inputs.parse();
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
b.iter(|| {
let mut validator = naga::valid::Validator::new(
naga::valid::ValidationFlags::all(),
naga::valid::Capabilities::all(),
);
validator
.subgroup_stages(naga::valid::ShaderStages::all())
.subgroup_operations(naga::valid::SubgroupOperationSet::all());
results.push(iter_auto(
&ctx,
"validation",
"bytes",
inputs.bytes() as u32,
|| {
for input in &inputs.inner {
validator.validate(input.module.as_ref().unwrap()).unwrap();
}
});
});
group.finish();
},
));
Ok(results)
}
fn compact(c: &mut Criterion) {
pub fn compact(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
use naga::compact::{compact, KeepUnused};
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
inputs.validate();
assert!(!inputs.is_empty());
let mut group = c.benchmark_group("compact");
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: compact", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"compact",
"bytes",
inputs.bytes() as u32,
|| {
for input in &mut inputs.inner {
compact(input.module.as_mut().unwrap(), KeepUnused::No);
}
});
});
group.finish();
},
));
Ok(results)
}
fn backends(c: &mut Criterion) {
pub fn backends(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
let mut results = Vec::new();
let mut inputs = get_wgsl_inputs();
let mut group = c.benchmark_group("back");
// While normally this would be done inside the bench_function callback, we need to
// run this to properly know the size of the inputs, as any that fail validation
// will be removed.
inputs.validate();
assert!(!inputs.is_empty());
group.throughput(Throughput::Bytes(inputs.bytes()));
group.bench_function("shader: wgsl-out", |b| {
b.iter(|| {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let total_bytes = inputs.bytes() as u32;
results.push(iter_auto(&ctx, "wgsl", "bytes", total_bytes, || {
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::WGSL) {
let mut writer =
naga::back::wgsl::Writer::new(&mut string, (&input.options.wgsl).into());
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
);
string.clear();
}
}
}));
results.push(iter_auto(&ctx, "spv", "bytes", total_bytes, || {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
string.clear();
data.clear();
}
}
});
});
}
}));
group.bench_function("shader: spv-out", |b| {
b.iter(|| {
let mut data = Vec::new();
let mut writer = naga::back::spv::Writer::new(&Default::default()).unwrap();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let opt = input
.options
.spv
.to_options(input.options.bounds_check_policies, None);
if writer.set_options(&opt).is_ok() {
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
&None,
&mut data,
);
data.clear();
}
}
}
});
});
group.bench_function("shader: spv-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"spv multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut data = Vec::new();
let options = naga::back::spv::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::SPIRV) {
if input.filename().contains("pointer-function-arg") {
// These fail due to https://github.com/gfx-rs/wgpu/issues/7315
continue;
}
let mut writer = naga::back::spv::Writer::new(&options).unwrap();
@ -397,51 +419,51 @@ fn backends(c: &mut Criterion) {
}
}
}
});
});
},
));
group.bench_function("shader: msl-out", |b| {
b.iter(|| {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
results.push(iter_auto(&ctx, "msl", "bytes", total_bytes, || {
let mut string = String::new();
let options = naga::back::msl::Options::default();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::METAL) {
let pipeline_options = naga::back::msl::PipelineOptions::default();
let mut writer = naga::back::msl::Writer::new(&mut string);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
&options,
&pipeline_options,
);
string.clear();
}
});
});
}
}));
group.bench_function("shader: hlsl-out", |b| {
b.iter(|| {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
); // may fail on unimplemented things
string.clear();
}
results.push(iter_auto(&ctx, "hlsl", "bytes", total_bytes, || {
let options = naga::back::hlsl::Options::default();
let mut string = String::new();
for input in &inputs.inner {
if input.options.targets.unwrap().contains(Targets::HLSL) {
let pipeline_options = Default::default();
let mut writer =
naga::back::hlsl::Writer::new(&mut string, &options, &pipeline_options);
let _ = writer.write(
input.module.as_ref().unwrap(),
input.module_info.as_ref().unwrap(),
None,
);
string.clear();
}
});
});
}
}));
group.bench_function("shader: glsl-out multiple entrypoints", |b| {
b.iter(|| {
results.push(iter_auto(
&ctx,
"glsl multiple entrypoints",
"bytes",
total_bytes,
|| {
let mut string = String::new();
let options = naga::back::glsl::Options {
version: naga::back::glsl::Version::new_gles(320),
@ -462,7 +484,6 @@ fn backends(c: &mut Criterion) {
multiview: None,
};
// might be `Err` if missing features
if let Ok(mut writer) = naga::back::glsl::Writer::new(
&mut string,
module,
@ -471,14 +492,14 @@ fn backends(c: &mut Criterion) {
&pipeline_options,
naga::proc::BoundsCheckPolicies::default(),
) {
let _ = writer.write(); // might be `Err` if unsupported
let _ = writer.write();
}
string.clear();
}
}
});
});
}
},
));
criterion_group!(shader, frontends, validation, compact, backends);
Ok(results)
}

34
benches/src/context.rs Normal file

@ -0,0 +1,34 @@
use std::time::Duration;
#[derive(Clone, Copy)]
pub enum LoopControl {
Iterations(u32),
Time(Duration),
}
impl Default for LoopControl {
fn default() -> Self {
LoopControl::Time(Duration::from_secs(2))
}
}
impl LoopControl {
pub(crate) fn finished(&self, iterations: u32, elapsed: Duration) -> bool {
match self {
LoopControl::Iterations(target) => iterations >= *target,
LoopControl::Time(target) => elapsed >= *target,
}
}
}
pub struct BenchmarkContext {
pub(crate) override_iters: Option<LoopControl>,
pub default_iterations: LoopControl,
pub(crate) is_test: bool,
}
impl BenchmarkContext {
pub fn is_test(&self) -> bool {
self.is_test
}
}
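// A small illustrative check of the two stopping conditions. This test module
// is a sketch added for exposition only and is not part of this commit.
#[cfg(test)]
mod loop_control_sketch {
    use super::*;
    use std::time::Duration;

    #[test]
    fn stops_on_iterations_or_time() {
        // Iteration-based control ignores elapsed time entirely.
        let by_count = LoopControl::Iterations(3);
        assert!(!by_count.finished(2, Duration::from_secs(100)));
        assert!(by_count.finished(3, Duration::ZERO));

        // Time-based control ignores the iteration count.
        let by_time = LoopControl::Time(Duration::from_secs(2));
        assert!(!by_time.finished(1_000, Duration::from_secs(1)));
        assert!(by_time.finished(0, Duration::from_secs(2)));
    }
}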

27
benches/src/file.rs Normal file

@ -0,0 +1,27 @@
use anyhow::Context as _;
use crate::BenchmarkFile;
const FILE_PREFIX: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../target/bench/");
pub const PREVIOUS: &str = "previous";
pub(crate) fn get_comparison_file(baseline: Option<&str>) -> Option<BenchmarkFile> {
let file_name = baseline.unwrap_or(PREVIOUS);
let path = format!("{FILE_PREFIX}{file_name}.json");
let file = std::fs::read_to_string(path).ok()?;
let benchmark_file: BenchmarkFile = serde_json::from_str(&file).ok()?;
Some(benchmark_file)
}
pub(crate) fn write_results_file(
file_name: &str,
output_file: &BenchmarkFile,
) -> anyhow::Result<()> {
let path = format!("{FILE_PREFIX}{file_name}.json");
let json = serde_json::to_string_pretty(output_file)?;
std::fs::create_dir_all(FILE_PREFIX)
.with_context(|| format!("Trying to create directory {FILE_PREFIX}"))?;
std::fs::write(&path, json).with_context(|| format!("Trying to write file {path}"))?;
Ok(())
}
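// Illustrative path resolution (assuming this crate lives under `benches/` in
// the repository): FILE_PREFIX then resolves to `<repo>/target/bench/`, so every
// run rewrites `<repo>/target/bench/previous.json`, and `--save-baseline foo`
// additionally writes `<repo>/target/bench/foo.json`.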

97
benches/src/iter.rs Normal file

@ -0,0 +1,97 @@
use std::time::Duration;
use crate::{BenchmarkContext, LoopControl, SubBenchResult};
pub fn iter(
ctx: &BenchmarkContext,
name: &str,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut() -> Duration,
) -> SubBenchResult {
profiling::scope!("iter", name);
let mut iterations = 0_u32;
let mut duration = Duration::ZERO;
let control = if let Some(override_control) = ctx.override_iters {
override_control
} else {
ctx.default_iterations
};
while !control.finished(iterations, duration) {
duration += f();
iterations += 1;
}
SubBenchResult {
name: name.to_string(),
avg_duration_per_iteration: duration / iterations,
iterations,
throughput_unit: throughput_unit.to_string(),
throughput_count_per_iteration,
}
}
pub fn iter_auto(
ctx: &BenchmarkContext,
name: &str,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut(),
) -> SubBenchResult {
iter(
ctx,
name,
throughput_unit,
throughput_count_per_iteration,
|| {
let start = std::time::Instant::now();
f();
start.elapsed()
},
)
}
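// A hedged usage sketch: `iter` suits cases where per-iteration setup must be
// excluded from the measured time, while `iter_auto` simply times the whole
// closure. `make_input`, `parse`, and `input_len` below are hypothetical names
// used only for illustration.
//
// let result = iter(&ctx, "parse", "bytes", input_len, || {
//     let input = make_input();              // untimed setup
//     let start = std::time::Instant::now();
//     parse(&input);                         // timed work
//     start.elapsed()
// });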
pub fn iter_many(
ctx: &BenchmarkContext,
names: Vec<String>,
throughput_unit: &str,
throughput_count_per_iteration: u32,
mut f: impl FnMut() -> Vec<Duration>,
) -> Vec<SubBenchResult> {
profiling::scope!("iter", &*names[0]);
let mut iterations = 0_u32;
let mut durations = vec![Duration::ZERO; names.len()];
let control = if let Some(override_control) = ctx.override_iters {
override_control
} else {
LoopControl::Time(Duration::from_secs(1))
};
// We use the first duration to determine whether to stop. This means the other sub-benchmarks
// could have run for longer or shorter than intended, but that's acceptable.
while !control.finished(iterations, *durations.first().unwrap_or(&Duration::ZERO)) {
let iteration_durations = f();
assert_eq!(iteration_durations.len(), names.len());
for (i, dur) in iteration_durations.into_iter().enumerate() {
durations[i] += dur;
}
iterations += 1;
}
durations
.into_iter()
.enumerate()
.map(|(i, d)| SubBenchResult {
name: names[i].to_string(),
avg_duration_per_iteration: d / iterations,
iterations,
throughput_unit: throughput_unit.to_string(),
throughput_count_per_iteration,
})
.collect()
}

261
benches/src/lib.rs Normal file

@ -0,0 +1,261 @@
#![cfg(not(target_arch = "wasm32"))]
#![expect(clippy::disallowed_types)] // We're outside of the main wgpu codebase
//! Benchmarking framework for `wgpu`.
//!
//! This crate is a basic framework for benchmarking. Its design is guided
//! by a few goals:
//!
//! - Enumerating tests should be extremely cheap. `criterion` needs
//! to run all of your benchmark functions to enumerate them during
//! testing. This requires your code to contort itself to avoid doing
//! any work until you enter a benchmark callback. This framework
//!   avoids that by having an explicit list of benchmark functions.
//! - It must be compatible with `cargo-nextest` and have a compatible
//! "test" mode that runs each benchmark exactly once.
//! - It should have intuitive test grouping, allowing for quick
//!   execution of a reasonable baseline set of benchmarks
//! during development, while still allowing for a more exhaustive
//! benchmark suite to be run if desired.
//!
//! By default each benchmark runs for 2 seconds, but this can be overridden
//! by individual benchmarks.
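//!
//! A minimal usage sketch (the benchmark body, the `example` names, and the
//! `benches` crate name are illustrative assumptions, not part of this commit):
//!
//! ```ignore
//! use benches::{iter_auto, Benchmark, BenchmarkContext, SubBenchResult};
//!
//! fn example_bench(ctx: BenchmarkContext) -> anyhow::Result<Vec<SubBenchResult>> {
//!     Ok(vec![iter_auto(&ctx, "noop", "elements", 1, || {
//!         std::hint::black_box(2 + 2);
//!     })])
//! }
//!
//! fn main() {
//!     benches::main(vec![Benchmark {
//!         name: "example",
//!         func: example_bench,
//!     }]);
//! }
//! ```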
use std::{collections::HashMap, io::IsTerminal, time::Duration};
use anyhow::Result;
use pico_args::Arguments;
use serde::{Deserialize, Serialize};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
mod context;
mod file;
mod iter;
mod print;
pub use context::*;
pub use iter::*;
use crate::file::PREVIOUS;
#[derive(Serialize, Deserialize, Default)]
pub struct BenchmarkFile {
pub results: HashMap<String, Vec<SubBenchResult>>,
}
impl BenchmarkFile {
pub fn get_result(
&self,
benchmark_name: &str,
sub_benchmark_name: &str,
) -> Option<&SubBenchResult> {
self.results
.get(benchmark_name)?
.iter()
.find(|r| r.name == sub_benchmark_name)
}
}
#[derive(Serialize, Deserialize)]
pub struct SubBenchResult {
/// Name of the subbenchmark.
pub name: String,
/// Average duration per iteration of the subbenchmark.
pub avg_duration_per_iteration: Duration,
/// Total number of iterations executed.
pub iterations: u32,
/// Throughput unit description. e.g., "bytes", "elements", etc.
pub throughput_unit: String,
/// Number of throughput units processed per iteration.
pub throughput_count_per_iteration: u32,
}
impl SubBenchResult {
pub fn throughput_per_second(&self) -> f64 {
let secs_f64 = self.avg_duration_per_iteration.as_secs_f64();
if secs_f64 == 0.0 {
return 0.0;
}
self.throughput_count_per_iteration as f64 / secs_f64
}
}
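// Worked example: 4_096 bytes processed per iteration at an average of 2 ms per
// iteration yields 4_096 / 0.002 = 2_048_000 bytes/s.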
pub struct Benchmark {
pub name: &'static str,
pub func: fn(BenchmarkContext) -> Result<Vec<SubBenchResult>>,
}
const HELP: &str = "\
Usage: wgpu-benchmark [OPTIONS] [BENCHMARK_NAME]
Modes:
--bench Run in benchmark mode, comparing against previous results.
--list List available benchmarks.
<no flag> Run in test mode, executing each benchmark exactly once.
Test Matching:
--exact When specifying BENCHMARK_NAME, only run exact matches.
BENCHMARK_NAME Only run benchmarks whose names contain this substring.
Comparison:
-b, --baseline NAME Specify a baseline file for comparison.
-s, --save-baseline NAME Save the results as a baseline file.
Timings:
--iters N Override number of iterations per benchmark.
--time SECONDS Override time per benchmark in seconds.
Other:
--color Set colored output (always,always-ansi,auto,never).
--format terse Specify --list output format (only 'terse' is supported).
--no-capture (Ignored)
";
pub fn main(benchmarks: Vec<Benchmark>) {
let mut args = Arguments::from_env();
let help = args.contains(["-h", "--help"]);
if help {
println!("{HELP}");
return;
}
let mut color: ColorChoice = args
.opt_value_from_str("--color")
.unwrap_or(None)
.unwrap_or(ColorChoice::Auto);
if color == ColorChoice::Auto && !std::io::stdin().is_terminal() {
color = ColorChoice::Never;
}
let exact = args.contains("--exact");
// We don't actually need this flag, but cargo-nextest passes it in
// test mode, so we need to accept it.
let _no_capture = args.contains("--no-capture");
#[expect(clippy::manual_map)] // So much clearer this way
let mut override_iterations = if let Some(iters) = args.opt_value_from_str("--iters").unwrap() {
Some(LoopControl::Iterations(iters))
} else if let Some(seconds) = args.opt_value_from_str("--time").unwrap() {
Some(LoopControl::Time(Duration::from_secs_f64(seconds)))
} else {
None
};
let baseline_name: Option<String> = args.opt_value_from_str(["-b", "--baseline"]).unwrap();
let write_baseline: Option<String> =
args.opt_value_from_str(["-s", "--save-baseline"]).unwrap();
let is_bench = args.contains("--bench");
let is_list = args.contains("--list");
let is_test = !is_bench && !is_list;
let format: Option<String> = args.opt_value_from_str("--format").unwrap();
if let Some(fmt) = format {
assert_eq!(fmt, "terse", "Only 'terse' format is supported.");
}
if let Some(ref baseline) = baseline_name {
if baseline == PREVIOUS {
eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
return;
}
}
if let Some(ref write_baseline) = write_baseline {
if write_baseline == PREVIOUS {
eprintln!("Cannot use '{PREVIOUS}' as a baseline name.");
return;
}
}
if override_iterations.is_none() && is_test {
override_iterations = Some(LoopControl::Iterations(1));
}
let name = args.free_from_str::<String>().ok();
let baseline = if is_bench {
let res = file::get_comparison_file(baseline_name.as_deref());
match (&res, baseline_name.as_deref()) {
(Some(_), Some(baseline)) => {
println!("Using baseline \"{baseline}\" for comparison.\n")
}
(None, Some(baseline)) => {
eprintln!("Could not find baseline named {baseline:?}.\n");
return;
}
(Some(_), None) => {
println!("Using previous benchmark results for comparison.\n");
}
(None, None) => {
println!("No previous benchmark results found for comparison.\n");
}
}
res
} else {
None
};
let mut output_file = BenchmarkFile::default();
let mut stdout = StandardStream::stdout(color);
for bench in benchmarks {
if let Some(ref bench_name) = name {
if exact {
if bench.name != bench_name {
continue;
}
} else if !bench.name.contains(bench_name) {
continue;
}
}
if is_list {
println!("{}: benchmark", bench.name);
continue;
}
let ctx = BenchmarkContext {
override_iters: override_iterations,
default_iterations: LoopControl::default(),
is_test,
};
stdout
.set_color(ColorSpec::new().set_fg(Some(Color::Blue)))
.unwrap();
println!("Running benchmark: {}", bench.name);
stdout.reset().unwrap();
let results = {
profiling::scope!("bench", bench.name);
let r = (bench.func)(ctx);
match r {
Ok(r) => r,
Err(e) => {
eprintln!(" Error running benchmark '{}': {:?}", bench.name, e);
continue;
}
}
};
let previous_results = if let Some(ref baseline) = baseline {
baseline.results.get(bench.name).map(|r| r.as_slice())
} else {
None
};
print::print_results(&mut stdout, &results, previous_results);
output_file.results.insert(bench.name.to_string(), results);
}
file::write_results_file(PREVIOUS, &output_file).unwrap();
if let Some(output_baseline) = write_baseline {
file::write_results_file(&output_baseline, &output_file).unwrap();
}
}

206
benches/src/print.rs Normal file

@ -0,0 +1,206 @@
use std::collections::HashMap;
use std::io::Write;
use termcolor::{Color, ColorSpec, StandardStream, WriteColor};
use crate::SubBenchResult;
#[derive(Default, Clone)]
struct Delta {
throughput_change_str: String,
throughput_change: f64,
time_change_str: String,
time_change: f64,
}
impl Delta {
fn new(previous: &SubBenchResult, current: &SubBenchResult) -> Self {
let prev_throughput = previous.throughput_per_second();
let curr_throughput = current.throughput_per_second();
let delta_throughput = if prev_throughput != 0.0 {
(curr_throughput - prev_throughput) / prev_throughput * 100.0
} else {
0.0
};
let throughput_change = format!(" ({delta_throughput:+.2}%)");
let prev_time = previous.avg_duration_per_iteration;
let curr_time = current.avg_duration_per_iteration;
let delta_time = if prev_time.as_nanos() != 0 {
(curr_time.as_secs_f64() - prev_time.as_secs_f64()) / prev_time.as_secs_f64() * 100.0
} else {
0.0
};
let time_change = format!("{delta_time:+.2}%; ");
Delta {
throughput_change_str: throughput_change,
throughput_change: delta_throughput,
time_change_str: time_change,
time_change: delta_time,
}
}
}
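// Worked example: a sub-benchmark that previously averaged 10 ms/iter and now
// averages 12 ms/iter gets a time change of +20.00%, while its throughput falls
// from N/0.010 to N/0.012 per second, a change of about -16.67%.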
/// Get a color spec for the given change percentage.
///
/// Positive changes are red (regression), negative changes are green (improvement).
/// This represents changes for time durations. For throughput changes, the sign should be inverted
/// before being passed to this function.
fn get_change_color(percent_change: f64) -> ColorSpec {
let mut color_spec = ColorSpec::new();
if percent_change > 3.0 {
color_spec.set_fg(Some(Color::Red));
} else if percent_change < -3.0 {
color_spec.set_fg(Some(Color::Green));
} else {
color_spec.set_fg(Some(Color::Yellow));
}
if percent_change.abs() > 15.0 {
color_spec.set_intense(true);
}
color_spec
}
pub fn print_results(
stdout: &mut StandardStream,
results: &[SubBenchResult],
previous_results: Option<&[SubBenchResult]>,
) {
let mut deltas = HashMap::new();
if let Some(previous_results) = previous_results {
for result in results {
if let Some(previous_result) = previous_results.iter().find(|r| r.name == result.name) {
deltas.insert(result.name.clone(), Delta::new(previous_result, result));
}
}
}
let longest_throughput_change_len = deltas
.values()
.map(|d| d.throughput_change_str.len())
.max()
.unwrap_or(0);
let longest_time_change_len = deltas
.values()
.map(|d| d.time_change_str.len())
.max()
.unwrap_or(0);
let longest_name_len = results.iter().map(|r| r.name.len()).max().unwrap_or(0);
let duration_strings: Vec<String> = results
.iter()
.map(|r| format!("{:.3?}", r.avg_duration_per_iteration))
.collect();
let longest_duration_len = duration_strings.iter().map(|s| s.len()).max().unwrap_or(0);
let iterations_strings: Vec<String> = results
.iter()
.map(|r| format!("{}", r.iterations))
.collect();
let longest_iterations_len = iterations_strings
.iter()
.map(|s| s.len())
.max()
.unwrap_or(0);
let throughput_strings: Vec<String> = results
.iter()
.map(|r| {
let throughput_per_second = r.throughput_count_per_iteration as f64
/ r.avg_duration_per_iteration.as_secs_f64();
human_scale(throughput_per_second)
})
.collect();
let longest_throughput_len = throughput_strings
.iter()
.map(|s| s.len())
.max()
.unwrap_or(0);
let longest_throughput_unit_len = results
.iter()
.map(|r| r.throughput_unit.len())
.max()
.unwrap_or(0);
for (i, result) in results.iter().enumerate() {
let delta = deltas.get(&result.name).cloned().unwrap_or_default();
let time_color = get_change_color(delta.time_change);
let throughput_color = get_change_color(-delta.throughput_change);
stdout
.set_color(ColorSpec::new().set_fg(Some(Color::Cyan)))
.unwrap();
write!(stdout, " {:>longest_name_len$}: ", result.name).unwrap();
stdout.set_color(&time_color).unwrap();
write!(stdout, "{:>longest_duration_len$} ", duration_strings[i],).unwrap();
stdout.reset().unwrap();
write!(stdout, "(").unwrap();
stdout.set_color(&time_color).unwrap();
write!(
stdout,
"{:>longest_time_change_len$}",
delta.time_change_str
)
.unwrap();
stdout.reset().unwrap();
write!(
stdout,
"over {:>longest_iterations_len$} iter) ",
result.iterations,
)
.unwrap();
stdout.set_color(&throughput_color).unwrap();
write!(stdout, "{:>longest_throughput_len$}", throughput_strings[i]).unwrap();
stdout.reset().unwrap();
write!(
stdout,
" {:>longest_throughput_unit_len$}/s",
result.throughput_unit,
)
.unwrap();
stdout.set_color(&throughput_color).unwrap();
writeln!(
stdout,
"{:>longest_throughput_change_len$}",
delta.throughput_change_str
)
.unwrap();
}
println!();
}
fn human_scale(value: f64) -> String {
const PREFIXES: &[&str] = &["", "K", "M", "G", "T", "P"];
if value == 0.0 {
return "0".to_string();
}
let abs_value = value.abs();
let exponent = (abs_value.log10() / 3.0).floor() as usize;
let prefix_index = exponent.min(PREFIXES.len() - 1);
let scaled = value / 10_f64.powi((prefix_index * 3) as i32);
// Determine decimal places for 3 significant figures
let decimal_places = if scaled.abs() >= 100.0 {
0
} else if scaled.abs() >= 10.0 {
1
} else {
2
};
format!(
"{:.prec$}{}",
scaled,
PREFIXES[prefix_index],
prec = decimal_places
)
}
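// Worked examples (computed from the logic above):
//   human_scale(1_234.0)     -> "1.23K"
//   human_scale(123_456.0)   -> "123K"
//   human_scale(2_500_000.0) -> "2.50M"
// Values below 1.0 keep the empty prefix, because the negative exponent
// saturates to 0 in the `as usize` cast: human_scale(0.5) -> "0.50".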