Mirror of https://github.com/gfx-rs/wgpu.git (synced 2025-12-08 21:26:17 +00:00)
[wgpu] add convenience functions for deferring mapping/callbacks
Co-authored-by: Kevin Reid <kpreid@switchb.org>
This commit is contained in:
parent 832609959d
commit fb28da16c1
22 CHANGELOG.md
@@ -42,6 +42,28 @@ Bottom level categories:

### Major Changes

#### Deferred command buffer actions: `map_buffer_on_submit` and `on_submitted_work_done`

You may schedule buffer mapping and a submission-complete callback to run automatically after you submit, directly from encoders, command buffers, and passes.

```rust
// Record some GPU work so the submission isn't empty and touches `buffer`.
encoder.clear_buffer(&buffer, 0, None);

// Defer mapping until this encoder is submitted.
encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 0..size, |result| { .. });

// Fires after the command buffer's work is finished.
encoder.on_submitted_work_done(|| { .. });

// Automatically calls `map_async` and `on_submitted_work_done` after this submission finishes.
queue.submit([encoder.finish()]);
```

Available on `CommandEncoder`, `CommandBuffer`, `RenderPass`, and `ComputePass`.
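The same methods can be called from inside a pass; a minimal sketch of deferring a readback from a compute pass (assuming `readback` is a `MAP_READ` buffer written by the pass and `size` is the number of bytes to map):

```rust
let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default());
// ... record dispatches that write `readback` ...
// The request is stored on the enclosing encoder and runs once its submission is made.
pass.map_buffer_on_submit(&readback, wgpu::MapMode::Read, 0..size, |result| { .. });
drop(pass); // release the borrow on `encoder` before finishing it

queue.submit([encoder.finish()]);
```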

By @cwfitzgerald in [#8125](https://github.com/gfx-rs/wgpu/pull/8125).

#### `EXPERIMENTAL_RAY_TRACING_ACCELERATION_STRUCTURE` has been merged into `EXPERIMENTAL_RAY_QUERY`

We have merged the acceleration structure feature into the `RayQuery` feature. This helps work around an AMD driver bug and reduces the feature complexity of ray tracing. In the future, when ray tracing pipelines are implemented, acceleration structures will be available if either feature is enabled.
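A minimal sketch of opting in under the merged feature (assuming an `adapter` is already in scope; the exact `request_device` call shape follows current wgpu usage and may differ by version):

```rust
let (device, queue) = adapter
    .request_device(&wgpu::DeviceDescriptor {
        label: Some("ray-query device"),
        required_features: wgpu::Features::EXPERIMENTAL_RAY_QUERY,
        ..Default::default()
    })
    .await?;
// Acceleration structures are now usable without requesting a separate feature.
```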
261 tests/tests/wgpu-validation/api/command_buffer_actions.rs Normal file
@@ -0,0 +1,261 @@
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering::SeqCst};
use std::sync::Arc;

/// Helper to create a small mappable buffer for READ tests.
fn make_read_buffer(device: &wgpu::Device, size: u64) -> wgpu::Buffer {
    device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("read buffer"),
        size,
        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    })
}

/// map_buffer_on_submit defers mapping until submit, then invokes the callback after polling.
#[test]
fn encoder_map_buffer_on_submit_defers_until_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    let fired = Arc::new(AtomicBool::new(false));
    let fired_cl = Arc::clone(&fired);

    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
        label: Some("encoder"),
    });

    // Register deferred map.
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 0..4, move |_| {
        fired_cl.store(true, SeqCst);
    });
    // Include a trivial command that uses the buffer.
    encoder.clear_buffer(&buffer, 0, None);

    // Polling before submit should not trigger the callback.
    _ = device.poll(wgpu::PollType::Poll);
    assert!(!fired.load(SeqCst));

    // Submit and wait; callback should fire.
    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);
    assert!(fired.load(SeqCst));
}

/// Empty ranges panic immediately when registering the deferred map.
#[test]
#[should_panic = "buffer slices can not be empty"]
fn encoder_map_buffer_on_submit_empty_range_panics_immediately() {
    let (device, _queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    let encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });

    // This panics inside map_buffer_on_submit (range_to_offset_size).
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 8..8, |_| {});
}

/// Out-of-bounds ranges panic during submit (when the deferred map executes).
#[test]
#[should_panic = "is out of range for buffer of size"]
fn encoder_map_buffer_on_submit_out_of_bounds_panics_on_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    // 12..24 overflows the 16-byte buffer (size=12, end=24).
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 12..24, |_| {});
    encoder.clear_buffer(&buffer, 0, None);

    // Panic happens inside submit when executing deferred actions.
    queue.submit([encoder.finish()]);
}

/// If the buffer is already mapped when the deferred mapping executes, it panics during submit.
#[test]
#[should_panic = "Buffer with 'read buffer' label is still mapped"]
fn encoder_map_buffer_on_submit_panics_if_already_mapped_on_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    // Start a mapping now so the buffer is considered mapped.
    buffer.slice(0..4).map_async(wgpu::MapMode::Read, |_| {});

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    // Deferred mapping of an already-mapped buffer will panic when executed on submit or be rejected by submit.
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 0..4, |_| {});
    // Include any trivial work; using the same buffer ensures core validation catches the mapped hazard.
    encoder.clear_buffer(&buffer, 0, None);

    queue.submit([encoder.finish()]);
}

/// on_submitted_work_done is deferred until submit.
#[test]
fn encoder_on_submitted_work_done_defers_until_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());

    let fired = Arc::new(AtomicBool::new(false));
    let fired_cl = Arc::clone(&fired);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });

    encoder.on_submitted_work_done(move || {
        fired_cl.store(true, SeqCst);
    });

    // Include a trivial command so the command buffer isn't completely empty.
    let dummy = make_read_buffer(&device, 4);
    encoder.clear_buffer(&dummy, 0, None);

    // Without submission, polling shouldn't invoke the callback.
    _ = device.poll(wgpu::PollType::Poll);
    assert!(!fired.load(SeqCst));

    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);
    assert!(fired.load(SeqCst));
}

/// Both kinds of deferred callbacks are enqueued and eventually invoked.
#[test]
fn encoder_both_callbacks_fire_after_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    let map_fired = Arc::new(AtomicBool::new(false));
    let map_fired_cl = Arc::clone(&map_fired);
    let queue_fired = Arc::new(AtomicBool::new(false));
    let queue_fired_cl = Arc::clone(&queue_fired);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 0..4, move |_| {
        map_fired_cl.store(true, SeqCst);
    });
    encoder.on_submitted_work_done(move || {
        queue_fired_cl.store(true, SeqCst);
    });
    encoder.clear_buffer(&buffer, 0, None);

    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);

    assert!(map_fired.load(SeqCst));
    assert!(queue_fired.load(SeqCst));
}

/// Registering multiple deferred mappings works; all callbacks fire after submit.
#[test]
fn encoder_multiple_map_buffer_on_submit_callbacks_fire() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer1 = make_read_buffer(&device, 32);
    let buffer2 = make_read_buffer(&device, 32);

    let counter = Arc::new(AtomicU32::new(0));
    let c1 = Arc::clone(&counter);
    let c2 = Arc::clone(&counter);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    encoder.map_buffer_on_submit(&buffer1, wgpu::MapMode::Read, 0..4, move |_| {
        c1.fetch_add(1, SeqCst);
    });
    encoder.map_buffer_on_submit(&buffer2, wgpu::MapMode::Read, 8..12, move |_| {
        c2.fetch_add(1, SeqCst);
    });
    encoder.clear_buffer(&buffer1, 0, None);

    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);

    assert_eq!(counter.load(SeqCst), 2);
}

/// Mapping with a buffer lacking MAP_* usage should panic when executed on submit.
#[test]
#[should_panic]
fn encoder_map_buffer_on_submit_panics_if_usage_invalid_on_submit() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let unmappable = device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("unmappable buffer"),
        size: 16,
        usage: wgpu::BufferUsages::COPY_DST, // No MAP_READ or MAP_WRITE
        mapped_at_creation: false,
    });

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    encoder.map_buffer_on_submit(&unmappable, wgpu::MapMode::Read, 0..4, |_| {});

    // Add unrelated work so the submission isn't empty.
    let dummy = make_read_buffer(&device, 4);
    encoder.clear_buffer(&dummy, 0, None);

    // Panic expected when deferred mapping executes.
    queue.submit([encoder.finish()]);
}

/// Deferred map callbacks run before on_submitted_work_done for the same submission.
#[test]
fn encoder_deferred_map_runs_before_on_submitted_work_done() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 16);

    #[derive(Default)]
    struct Order {
        map_order: AtomicU32,
        queue_order: AtomicU32,
        counter: AtomicU32,
    }
    let order = Arc::new(Order::default());
    let o_map = Arc::clone(&order);
    let o_queue = Arc::clone(&order);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    encoder.map_buffer_on_submit(&buffer, wgpu::MapMode::Read, 0..4, move |_| {
        let v = o_map.counter.fetch_add(1, SeqCst);
        o_map.map_order.store(v, SeqCst);
    });
    encoder.on_submitted_work_done(move || {
        let v = o_queue.counter.fetch_add(1, SeqCst);
        o_queue.queue_order.store(v, SeqCst);
    });
    encoder.clear_buffer(&buffer, 0, None);

    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);

    assert_eq!(order.counter.load(SeqCst), 2);
    assert_eq!(order.map_order.load(SeqCst), 0);
    assert_eq!(order.queue_order.load(SeqCst), 1);
}

/// Multiple on_submitted_work_done callbacks registered on encoder all fire after submit.
#[test]
fn encoder_multiple_on_submitted_callbacks_fire() {
    let (device, queue) = wgpu::Device::noop(&wgpu::DeviceDescriptor::default());
    let buffer = make_read_buffer(&device, 4);

    let counter = Arc::new(AtomicU32::new(0));
    let c1 = Arc::clone(&counter);
    let c2 = Arc::clone(&counter);

    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    encoder.on_submitted_work_done(move || {
        c1.fetch_add(1, SeqCst);
    });
    encoder.on_submitted_work_done(move || {
        c2.fetch_add(1, SeqCst);
    });
    encoder.clear_buffer(&buffer, 0, None);

    queue.submit([encoder.finish()]);
    _ = device.poll(wgpu::PollType::Wait);

    assert_eq!(counter.load(SeqCst), 2);
}

@@ -2,6 +2,7 @@ mod binding_arrays;
mod buffer;
mod buffer_mapping;
mod buffer_slice;
mod command_buffer_actions;
mod device;
mod experimental;
mod external_texture;

@@ -347,20 +347,28 @@ impl Buffer {
        self.usage
    }

    /// Map the buffer to host (CPU) memory, making it available for reading or writing
    /// via [`get_mapped_range()`](Self::get_mapped_range).
    /// It is available once the `callback` is called with an [`Ok`] response.
    /// Map the buffer to host (CPU) memory, making it available for reading or writing via
    /// [`get_mapped_range()`](Self::get_mapped_range). The buffer becomes accessible once the
    /// `callback` is invoked with [`Ok`].
    ///
    /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)`
    /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread.
    /// Use this when you want to map the buffer immediately. If you need to submit GPU work that
    /// uses the buffer before mapping it, use `map_buffer_on_submit` on
    /// [`CommandEncoder`][CEmbos], [`CommandBuffer`][CBmbos], [`RenderPass`][RPmbos], or
    /// [`ComputePass`][CPmbos] to schedule the mapping after submission. This avoids extra calls to
    /// [`Buffer::map_async()`] or [`BufferSlice::map_async()`] and lets you initiate mapping from a
    /// more convenient place.
    ///
    /// The callback will be called on the thread that first calls the above functions after the GPU work
    /// has completed. There are no restrictions on the code you can run in the callback, however on native the
    /// call to the function will not complete until the callback returns, so prefer keeping callbacks short
    /// and used to set flags, send messages, etc.
    /// For the callback to run, either [`queue.submit(..)`][q::s], [`instance.poll_all(..)`][i::p_a],
    /// or [`device.poll(..)`][d::p] must be called elsewhere in the runtime, possibly integrated into
    /// an event loop or run on a separate thread.
    ///
    /// As long as a buffer is mapped, it is not available for use by any other commands;
    /// at all times, either the GPU or the CPU has exclusive access to the contents of the buffer.
    /// The callback runs on the thread that first calls one of the above functions after the GPU work
    /// completes. There are no restrictions on the code you can run in the callback; however, on native
    /// the polling call will not return until the callback finishes, so keep callbacks short (set flags,
    /// send messages, etc.).
    ///
    /// While a buffer is mapped, it cannot be used by other commands; at any time, either the GPU or
    /// the CPU has exclusive access to the buffer’s contents.
    ///
    /// This can also be performed using [`BufferSlice::map_async()`].
    ///
@@ -371,6 +379,14 @@ impl Buffer {
    /// - If `bounds` is outside of the bounds of `self`.
    /// - If `bounds` has a length less than 1.
    /// - If the start and end of `bounds` are not aligned to [`MAP_ALIGNMENT`].
    ///
    /// [CEmbos]: CommandEncoder::map_buffer_on_submit
    /// [CBmbos]: CommandBuffer::map_buffer_on_submit
    /// [RPmbos]: RenderPass::map_buffer_on_submit
    /// [CPmbos]: ComputePass::map_buffer_on_submit
    /// [q::s]: Queue::submit
    /// [i::p_a]: Instance::poll_all
    /// [d::p]: Device::poll
    pub fn map_async<S: RangeBounds<BufferAddress>>(
        &self,
        mode: MapMode,
@@ -508,20 +524,28 @@ impl<'a> BufferSlice<'a> {
        }
    }

    /// Map the buffer to host (CPU) memory, making it available for reading or writing
    /// via [`get_mapped_range()`](Self::get_mapped_range).
    /// It is available once the `callback` is called with an [`Ok`] response.
    /// Map the buffer to host (CPU) memory, making it available for reading or writing via
    /// [`get_mapped_range()`](Self::get_mapped_range). The buffer becomes accessible once the
    /// `callback` is invoked with [`Ok`].
    ///
    /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)`
    /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread.
    /// Use this when you want to map the buffer immediately. If you need to submit GPU work that
    /// uses the buffer before mapping it, use `map_buffer_on_submit` on
    /// [`CommandEncoder`][CEmbos], [`CommandBuffer`][CBmbos], [`RenderPass`][RPmbos], or
    /// [`ComputePass`][CPmbos] to schedule the mapping after submission. This avoids extra calls to
    /// [`Buffer::map_async()`] or [`BufferSlice::map_async()`] and lets you initiate mapping from a
    /// more convenient place.
    ///
    /// The callback will be called on the thread that first calls the above functions after the GPU work
    /// has completed. There are no restrictions on the code you can run in the callback, however on native the
    /// call to the function will not complete until the callback returns, so prefer keeping callbacks short
    /// and used to set flags, send messages, etc.
    /// For the callback to run, either [`queue.submit(..)`][q::s], [`instance.poll_all(..)`][i::p_a],
    /// or [`device.poll(..)`][d::p] must be called elsewhere in the runtime, possibly integrated into
    /// an event loop or run on a separate thread.
    ///
    /// As long as a buffer is mapped, it is not available for use by any other commands;
    /// at all times, either the GPU or the CPU has exclusive access to the contents of the buffer.
    /// The callback runs on the thread that first calls one of the above functions after the GPU work
    /// completes. There are no restrictions on the code you can run in the callback; however, on native
    /// the polling call will not return until the callback finishes, so keep callbacks short (set flags,
    /// send messages, etc.).
    ///
    /// While a buffer is mapped, it cannot be used by other commands; at any time, either the GPU or
    /// the CPU has exclusive access to the buffer’s contents.
    ///
    /// This can also be performed using [`Buffer::map_async()`].
    ///
@@ -530,6 +554,14 @@ impl<'a> BufferSlice<'a> {
    /// - If the buffer is already mapped.
    /// - If the buffer’s [`BufferUsages`] do not allow the requested [`MapMode`].
    /// - If the endpoints of this slice are not aligned to [`MAP_ALIGNMENT`] within the buffer.
    ///
    /// [CEmbos]: CommandEncoder::map_buffer_on_submit
    /// [CBmbos]: CommandBuffer::map_buffer_on_submit
    /// [RPmbos]: RenderPass::map_buffer_on_submit
    /// [CPmbos]: ComputePass::map_buffer_on_submit
    /// [q::s]: Queue::submit
    /// [i::p_a]: Instance::poll_all
    /// [d::p]: Device::poll
    pub fn map_async(
        &self,
        mode: MapMode,
@@ -977,7 +1009,7 @@ fn check_buffer_bounds(
}

#[track_caller]
fn range_to_offset_size<S: RangeBounds<BufferAddress>>(
pub(crate) fn range_to_offset_size<S: RangeBounds<BufferAddress>>(
    bounds: S,
    whole_size: BufferAddress,
) -> (BufferAddress, BufferSize) {

@@ -1,4 +1,7 @@
use crate::*;
use crate::{
    api::{impl_deferred_command_buffer_actions, SharedDeferredCommandBufferActions},
    *,
};

/// Handle to a command buffer on the GPU.
///
@@ -10,6 +13,8 @@ use crate::*;
#[derive(Debug)]
pub struct CommandBuffer {
    pub(crate) buffer: dispatch::DispatchCommandBuffer,
    /// Deferred actions recorded at encode time, to run at Queue::submit.
    pub(crate) actions: SharedDeferredCommandBufferActions,
}
#[cfg(send_sync)]
static_assertions::assert_impl_all!(CommandBuffer: Send, Sync);
@@ -20,4 +25,8 @@ impl CommandBuffer {
    pub fn as_custom<T: custom::CommandBufferInterface>(&self) -> Option<&T> {
        self.buffer.as_custom()
    }

    // Expose map_buffer_on_submit/on_submitted_work_done on CommandBuffer as well,
    // so callers can schedule after finishing encoding.
    impl_deferred_command_buffer_actions!();
}

147 wgpu/src/api/command_buffer_actions.rs Normal file
@@ -0,0 +1,147 @@
use alloc::{sync::Arc, vec::Vec};
use core::num::NonZeroU64;

use crate::{util::Mutex, *};

/// A deferred buffer mapping request captured during encoding (or a pass)
/// and executed later when the command buffer is submitted.
pub(crate) struct DeferredBufferMapping {
    pub buffer: api::Buffer,
    pub mode: MapMode,
    pub offset: u64,
    pub size: NonZeroU64,
    pub callback: dispatch::BufferMapCallback,
}

pub(super) type SharedDeferredCommandBufferActions = Arc<Mutex<DeferredCommandBufferActions>>;

/// Set of actions to take when the command buffer is submitted.
#[derive(Default)]
pub(crate) struct DeferredCommandBufferActions {
    pub buffer_mappings: Vec<DeferredBufferMapping>,
    pub on_submitted_work_done_callbacks: Vec<dispatch::BoxSubmittedWorkDoneCallback>,
}

impl DeferredCommandBufferActions {
    pub fn append(&mut self, other: &mut Self) {
        self.buffer_mappings.append(&mut other.buffer_mappings);
        self.on_submitted_work_done_callbacks
            .append(&mut other.on_submitted_work_done_callbacks);
    }

    pub fn execute(self, queue: &dispatch::DispatchQueue) {
        for mapping in self.buffer_mappings {
            mapping.buffer.map_async(
                mapping.mode,
                mapping.offset..mapping.offset + mapping.size.get(),
                mapping.callback,
            );
        }
        for callback in self.on_submitted_work_done_callbacks {
            queue.on_submitted_work_done(callback);
        }
    }
}

impl core::fmt::Debug for DeferredCommandBufferActions {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("DeferredCommandBufferActions")
            .field("buffer_mappings.len()", &self.buffer_mappings.len())
            .field(
                "on_submitted_work_done_callbacks.len()",
                &self.on_submitted_work_done_callbacks.len(),
            )
            .finish()
    }
}

// We can't just implement this on CommandEncoders as by default passes make it so that
// you can't call any commands on the encoder while this is happening. As such, we need
// to implement these methods on the passes too. Use a macro to avoid massive code duplication
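// The macro body below is expanded inside `CommandEncoder`, `CommandBuffer`, `RenderPass`,
// and `ComputePass`; a pass pushes onto the same shared action list (`self.actions`) as the
// encoder that created it, so requests recorded mid-pass still run at Queue::submit.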
macro_rules! impl_deferred_command_buffer_actions {
    () => {
        /// On submission, maps the buffer to host (CPU) memory, making it available
        /// for reading or writing via [`get_mapped_range()`](Buffer::get_mapped_range).
        /// The buffer becomes accessible once the `callback` is invoked with [`Ok`].
        ///
        /// Use this when you need to submit work that uses the buffer before mapping it.
        /// Because that submission must happen before calling `map_async`, this method
        /// schedules the mapping for after submission, avoiding extra calls to
        /// [`Buffer::map_async()`] or [`BufferSlice::map_async()`] and letting you start
        /// the mapping from a more convenient place.
        ///
        /// For the callback to run, either [`queue.submit(..)`][q::s], [`instance.poll_all(..)`][i::p_a],
        /// or [`device.poll(..)`][d::p] must be called elsewhere in the runtime, possibly integrated
        /// into an event loop or run on a separate thread.
        ///
        /// The callback runs on the thread that first calls one of the above functions
        /// after the GPU work completes. There are no restrictions on the code you can run
        /// in the callback; however, on native the polling call will not return until the
        /// callback finishes, so keep callbacks short (set flags, send messages, etc.).
        ///
        /// While a buffer is mapped, it cannot be used by other commands; at any time,
        /// either the GPU or the CPU has exclusive access to the buffer’s contents.
        ///
        /// # Panics
        ///
        /// - If `bounds` is outside the bounds of `buffer`.
        /// - If `bounds` has a length less than 1.
        ///
        /// # Panics During Submit
        ///
        /// - If the buffer is already mapped.
        /// - If the buffer’s [`BufferUsages`] do not allow the requested [`MapMode`].
        /// - If the endpoints of this slice are not aligned to [`MAP_ALIGNMENT`] within the buffer.
        ///
        /// [q::s]: Queue::submit
        /// [i::p_a]: Instance::poll_all
        /// [d::p]: Device::poll
        /// [CEmbos]: CommandEncoder::map_buffer_on_submit
        /// [CBmbos]: CommandBuffer::map_buffer_on_submit
        /// [RPmbos]: RenderPass::map_buffer_on_submit
        /// [CPmbos]: ComputePass::map_buffer_on_submit
        pub fn map_buffer_on_submit<S: core::ops::RangeBounds<BufferAddress>>(
            &self,
            buffer: &api::Buffer,
            mode: MapMode,
            bounds: S,
            callback: impl FnOnce(Result<(), BufferAsyncError>) + WasmNotSend + 'static,
        ) {
            let (offset, size) = range_to_offset_size(bounds, buffer.size);
            self.actions.lock().buffer_mappings.push(
                crate::api::command_buffer_actions::DeferredBufferMapping {
                    buffer: buffer.clone(),
                    mode,
                    offset,
                    size,
                    callback: alloc::boxed::Box::new(callback),
                },
            );
        }

        /// Registers a callback that is invoked when this command buffer’s work finishes
        /// executing on the GPU. When this callback runs, all mapped-buffer callbacks
        /// registered for the same submission are guaranteed to have been called.
        ///
        /// For the callback to run, either [`queue.submit(..)`][q::s], [`instance.poll_all(..)`][i::p_a],
        /// or [`device.poll(..)`][d::p] must be called elsewhere in the runtime, possibly integrated
        /// into an event loop or run on a separate thread.
        ///
        /// The callback runs on the thread that first calls one of the above functions
        /// after the GPU work completes. There are no restrictions on the code you can run
        /// in the callback; however, on native the polling call will not return until the
        /// callback finishes, so keep callbacks short (set flags, send messages, etc.).
        ///
        /// [q::s]: Queue::submit
        /// [i::p_a]: Instance::poll_all
        /// [d::p]: Device::poll
        pub fn on_submitted_work_done(&self, callback: impl FnOnce() + Send + 'static) {
            self.actions
                .lock()
                .on_submitted_work_done_callbacks
                .push(alloc::boxed::Box::new(callback));
        }
    };
}

pub(crate) use impl_deferred_command_buffer_actions;

@@ -1,7 +1,11 @@
use alloc::sync::Arc;
use core::ops::Range;

use crate::{
    api::{blas::BlasBuildEntry, tlas::Tlas},
    api::{
        blas::BlasBuildEntry, impl_deferred_command_buffer_actions, tlas::Tlas,
        SharedDeferredCommandBufferActions,
    },
    *,
};

@@ -17,6 +21,7 @@ use crate::{
#[derive(Debug)]
pub struct CommandEncoder {
    pub(crate) inner: dispatch::DispatchCommandEncoder,
    pub(crate) actions: SharedDeferredCommandBufferActions,
}
#[cfg(send_sync)]
static_assertions::assert_impl_all!(CommandEncoder: Send, Sync);
@@ -52,10 +57,10 @@ static_assertions::assert_impl_all!(TexelCopyTextureInfo<'_>: Send, Sync);

impl CommandEncoder {
    /// Finishes recording and returns a [`CommandBuffer`] that can be submitted for execution.
    pub fn finish(mut self) -> CommandBuffer {
        let buffer = self.inner.finish();

        CommandBuffer { buffer }
    pub fn finish(self) -> CommandBuffer {
        let Self { mut inner, actions } = self;
        let buffer = inner.finish();
        CommandBuffer { buffer, actions }
    }

    /// Begins recording of a render pass.
@@ -75,6 +80,7 @@ impl CommandEncoder {
        let rpass = self.inner.begin_render_pass(desc);
        RenderPass {
            inner: rpass,
            actions: Arc::clone(&self.actions),
            _encoder_guard: api::PhantomDrop::default(),
        }
    }
@@ -96,6 +102,7 @@ impl CommandEncoder {
        let cpass = self.inner.begin_compute_pass(desc);
        ComputePass {
            inner: cpass,
            actions: Arc::clone(&self.actions),
            _encoder_guard: api::PhantomDrop::default(),
        }
    }
@@ -232,6 +239,8 @@ impl CommandEncoder {
        );
    }

    impl_deferred_command_buffer_actions!();

    /// Get the [`wgpu_hal`] command encoder from this `CommandEncoder`.
    ///
    /// The returned command encoder will be ready to record onto.

@@ -1,4 +1,7 @@
use crate::*;
use crate::{
    api::{impl_deferred_command_buffer_actions, SharedDeferredCommandBufferActions},
    *,
};

/// In-progress recording of a compute pass.
///
@@ -10,6 +13,9 @@ use crate::*;
pub struct ComputePass<'encoder> {
    pub(crate) inner: dispatch::DispatchComputePass,

    /// Shared with CommandEncoder to enqueue deferred actions from within a pass.
    pub(crate) actions: SharedDeferredCommandBufferActions,

    /// This lifetime is used to protect the [`CommandEncoder`] from being used
    /// while the pass is alive. This needs to be PhantomDrop to prevent the lifetime
    /// from being shortened.
@@ -37,6 +43,7 @@ impl ComputePass<'_> {
    pub fn forget_lifetime(self) -> ComputePass<'static> {
        ComputePass {
            inner: self.inner,
            actions: self.actions,
            _encoder_guard: crate::api::PhantomDrop::default(),
        }
    }
@@ -95,6 +102,8 @@ impl ComputePass<'_> {
            .dispatch_workgroups_indirect(&indirect_buffer.inner, indirect_offset);
    }

    impl_deferred_command_buffer_actions!();

    #[cfg(custom)]
    /// Returns custom implementation of ComputePass (if custom backend and is internally T)
    pub fn as_custom<T: custom::ComputePassInterface>(&self) -> Option<&T> {

@@ -198,7 +198,12 @@ impl Device {
    #[must_use]
    pub fn create_command_encoder(&self, desc: &CommandEncoderDescriptor<'_>) -> CommandEncoder {
        let encoder = self.inner.create_command_encoder(desc);
        CommandEncoder { inner: encoder }
        // Each encoder starts with its own deferred-action store that travels
        // with the CommandBuffer produced by finish().
        CommandEncoder {
            inner: encoder,
            actions: Default::default(),
        }
    }

    /// Creates an empty [`RenderBundleEncoder`].

@@ -26,6 +26,8 @@ mod bind_group_layout;
mod blas;
mod buffer;
mod command_buffer;
/// Not a root type, but common types for command buffer deferral actions.
mod command_buffer_actions;
mod command_encoder;
// Not a root type, but common descriptor types for pipelines.
mod common_pipeline;
@@ -56,6 +58,7 @@ pub use bind_group_layout::*;
pub use blas::*;
pub use buffer::*;
pub use command_buffer::*;
use command_buffer_actions::*;
pub use command_encoder::*;
pub use common_pipeline::*;
pub use compute_pass::*;

@@ -1,7 +1,7 @@
use alloc::boxed::Box;
use core::ops::{Deref, DerefMut};

use crate::*;
use crate::{api::DeferredCommandBufferActions, *};

/// Handle to a command queue on a device.
///
@@ -248,10 +248,19 @@ impl Queue {
        &self,
        command_buffers: I,
    ) -> SubmissionIndex {
        let mut command_buffers = command_buffers.into_iter().map(|comb| comb.buffer);
        // As submit drains the iterator (even on error), collect deferred actions
        // from each CommandBuffer along the way.
        let mut actions = DeferredCommandBufferActions::default();

        let mut command_buffers = command_buffers.into_iter().map(|comb| {
            actions.append(&mut comb.actions.lock());
            comb.buffer
        });
        let index = self.inner.submit(&mut command_buffers);

        // Execute all deferred actions after submit.
        actions.execute(&self.inner);

        SubmissionIndex { index }
    }

@@ -265,17 +274,22 @@ impl Queue {
        self.inner.get_timestamp_period()
    }

    /// Registers a callback when the previous call to submit finishes running on the gpu. This callback
    /// being called implies that all mapped buffer callbacks which were registered before this call will
    /// have been called.
    /// Registers a callback that is invoked when the previous [`Queue::submit`] finishes executing
    /// on the GPU. When this callback runs, all mapped-buffer callbacks registered for the same
    /// submission are guaranteed to have been called.
    ///
    /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)`
    /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread.
    /// For the callback to run, either [`queue.submit(..)`][q::s], [`instance.poll_all(..)`][i::p_a],
    /// or [`device.poll(..)`][d::p] must be called elsewhere in the runtime, possibly integrated into
    /// an event loop or run on a separate thread.
    ///
    /// The callback will be called on the thread that first calls the above functions after the gpu work
    /// has completed. There are no restrictions on the code you can run in the callback, however on native the
    /// call to the function will not complete until the callback returns, so prefer keeping callbacks short
    /// and used to set flags, send messages, etc.
    /// The callback runs on the thread that first calls one of the above functions after the GPU work
    /// completes. There are no restrictions on the code you can run in the callback; however, on native
    /// the polling call will not return until the callback finishes, so keep callbacks short (set flags,
    /// send messages, etc.).
    ///
    /// [q::s]: Queue::submit
    /// [i::p_a]: Instance::poll_all
    /// [d::p]: Device::poll
    pub fn on_submitted_work_done(&self, callback: impl FnOnce() + Send + 'static) {
        self.inner.on_submitted_work_done(Box::new(callback));
    }

@@ -1,6 +1,9 @@
use core::ops::Range;

use crate::*;
use crate::{
    api::{impl_deferred_command_buffer_actions, SharedDeferredCommandBufferActions},
    *,
};
pub use wgt::{LoadOp, Operations, StoreOp};

/// In-progress recording of a render pass: a list of render commands in a [`CommandEncoder`].
@@ -24,6 +27,7 @@ pub use wgt::{LoadOp, Operations, StoreOp};
#[derive(Debug)]
pub struct RenderPass<'encoder> {
    pub(crate) inner: dispatch::DispatchRenderPass,
    pub(crate) actions: SharedDeferredCommandBufferActions,

    /// This lifetime is used to protect the [`CommandEncoder`] from being used
    /// while the pass is alive. This needs to be PhantomDrop to prevent the lifetime
@@ -52,6 +56,7 @@ impl RenderPass<'_> {
    pub fn forget_lifetime(self) -> RenderPass<'static> {
        RenderPass {
            inner: self.inner,
            actions: self.actions,
            _encoder_guard: crate::api::PhantomDrop::default(),
        }
    }
@@ -278,6 +283,8 @@ impl RenderPass<'_> {
            .draw_mesh_tasks_indirect(&indirect_buffer.inner, indirect_offset);
    }

    impl_deferred_command_buffer_actions!();

    /// Execute a [render bundle][RenderBundle], which is a set of pre-recorded commands
    /// that can be run together.
    ///

@@ -232,6 +232,7 @@ pub trait QueueInterface: CommonTraits {
        size: crate::Extent3d,
    );

    /// Submit must always drain the iterator, even in the case of error.
    fn submit(&self, command_buffers: &mut dyn Iterator<Item = DispatchCommandBuffer>) -> u64;

    fn get_timestamp_period(&self) -> f32;