From 7aa57537016c732ca62bbb88bee5674ad29f19f0 Mon Sep 17 00:00:00 2001
From: Dzmitry Malyshau
Date: Wed, 10 Jun 2020 09:32:51 -0400
Subject: [PATCH] Re-architect the bundles using normalized command streams.

This is a major change in how the bundles are implemented. Instead of
transparently injecting them into the pass command stream, we are now
treating bundles as first-class API objects and API-tracing them
accordingly. The bundle contains a normalized command stream that is
very easy to inject into a native command buffer multiple times.

---
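Note: the `fill_compute_commands` / `fill_render_commands` helpers that the
player calls below absorb the offset-slicing loops it used to open-code
(visible in the removed lines of the first hunks). Their actual bodies are
added elsewhere in this patch (wgpu-core/src/command/, not shown in this
excerpt); the following is only a sketch of what the compute variant
presumably does, reconstructed from the loop it replaces:

    impl RawPass {
        // Sketch, not the shipped implementation: re-encode a command stream
        // plus its flat array of dynamic offsets. Every `SetBindGroup` command
        // is followed by its next `num_dynamic_offsets` offsets, taken off the
        // front of `offsets`.
        pub unsafe fn fill_compute_commands(
            &mut self,
            commands: &[ComputeCommand],
            mut offsets: &[wgt::DynamicOffset],
        ) {
            for com in commands {
                self.encode(com);
                if let ComputeCommand::SetBindGroup {
                    num_dynamic_offsets,
                    ..
                } = *com
                {
                    self.encode_slice(&offsets[..num_dynamic_offsets as usize]);
                    offsets = &offsets[num_dynamic_offsets as usize..];
                }
            }
        }
    }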
 player/src/main.rs               |  54 +--
 wgpu-core/src/command/bind.rs    |  11 +-
 wgpu-core/src/command/bundle.rs  | 720 +++++++++++++++++++++++++++----
 wgpu-core/src/command/compute.rs |  21 +-
 wgpu-core/src/command/mod.rs     |  10 +-
 wgpu-core/src/command/render.rs  | 155 ++++---
 wgpu-core/src/device/life.rs     |  49 ++-
 wgpu-core/src/device/mod.rs      |  46 +-
 wgpu-core/src/device/trace.rs    |  48 ++-
 wgpu-core/src/hub.rs             |   3 +-
 wgpu-core/src/pipeline.rs        |   1 -
 wgpu-core/src/track/mod.rs       |  10 +-
 wgpu-types/src/lib.rs            |  16 +-
 13 files changed, 910 insertions(+), 234 deletions(-)

diff --git a/player/src/main.rs b/player/src/main.rs
index 3d360eeba..4a7ba5d2b 100644
--- a/player/src/main.rs
+++ b/player/src/main.rs
@@ -143,19 +143,8 @@ impl GlobalExt for wgc::hub::Global<IdentityPassThroughFactory> {
                 commands,
                 dynamic_offsets,
             } => unsafe {
-                let mut offsets = &dynamic_offsets[..];
                 let mut pass = wgc::command::RawPass::new_compute(encoder);
-                for com in commands {
-                    pass.encode(&com);
-                    if let wgc::command::ComputeCommand::SetBindGroup {
-                        num_dynamic_offsets,
-                        ..
-                    } = com
-                    {
-                        pass.encode_slice(&offsets[..num_dynamic_offsets as usize]);
-                        offsets = &offsets[num_dynamic_offsets as usize..];
-                    }
-                }
+                pass.fill_compute_commands(&commands, &dynamic_offsets);
                 let (data, _) = pass.finish_compute();
                 self.command_encoder_run_compute_pass::<B>(encoder, &data);
             },
@@ -165,7 +154,6 @@ impl GlobalExt for wgc::hub::Global<IdentityPassThroughFactory> {
                 commands,
                 dynamic_offsets,
             } => unsafe {
-                let mut offsets = &dynamic_offsets[..];
                 let mut pass = wgc::command::RawPass::new_render(
                     encoder,
                     &wgc::command::RenderPassDescriptor {
@@ -174,17 +162,7 @@ impl GlobalExt for wgc::hub::Global<IdentityPassThroughFactory> {
                         depth_stencil_attachment: target_depth_stencil.as_ref(),
                     },
                 );
-                for com in commands {
-                    pass.encode(&com);
-                    if let wgc::command::RenderCommand::SetBindGroup {
-                        num_dynamic_offsets,
-                        ..
-                    } = com
-                    {
-                        pass.encode_slice(&offsets[..num_dynamic_offsets as usize]);
-                        offsets = &offsets[num_dynamic_offsets as usize..];
-                    }
-                }
+                pass.fill_render_commands(&commands, &dynamic_offsets);
                 let (data, _) = pass.finish_render();
                 self.command_encoder_run_render_pass::<B>(encoder, &data);
             },
@@ -408,6 +386,34 @@ impl GlobalExt for wgc::hub::Global<IdentityPassThroughFactory> {
             A::DestroyRenderPipeline(id) => {
                 self.render_pipeline_destroy::<B>(id);
             }
+            A::CreateRenderBundle {
+                id,
+                desc,
+                commands,
+                dynamic_offsets,
+            } => {
+                let label = Label::new(&desc.label);
+                let mut bundle_encoder = wgc::command::RenderBundleEncoder::new(
+                    &wgt::RenderBundleEncoderDescriptor {
+                        label: None,
+                        color_formats: &desc.color_formats,
+                        depth_stencil_format: desc.depth_stencil_format,
+                        sample_count: desc.sample_count,
+                    },
+                    device,
+                );
+                bundle_encoder.fill_commands(&commands, &dynamic_offsets);
+                self.render_bundle_encoder_finish::<B>(
+                    bundle_encoder,
+                    &wgt::RenderBundleDescriptor {
+                        label: label.as_ptr(),
+                    },
+                    id,
+                );
+            }
+            A::DestroyRenderBundle(id) => {
+                self.render_bundle_destroy::<B>(id);
+            }
             A::WriteBuffer {
                 id,
                 data,
diff --git a/wgpu-core/src/command/bind.rs b/wgpu-core/src/command/bind.rs
index 1c2d5724e..375f0b78b 100644
--- a/wgpu-core/src/command/bind.rs
+++ b/wgpu-core/src/command/bind.rs
@@ -6,14 +6,13 @@ use crate::{
     binding_model::BindGroup,
     hub::GfxBackend,
     id::{BindGroupId, BindGroupLayoutId, PipelineLayoutId},
-    Stored,
+    Stored, MAX_BIND_GROUPS,
 };

-use smallvec::{smallvec, SmallVec};
+use arrayvec::ArrayVec;
 use std::slice;
 use wgt::DynamicOffset;

-pub const DEFAULT_BIND_GROUPS: usize = 4;
 type BindGroupMask = u8;

 #[derive(Clone, Debug)]
@@ -134,14 +133,16 @@ impl BindGroupEntry {
 #[derive(Debug)]
 pub struct Binder {
     pub(crate) pipeline_layout_id: Option<PipelineLayoutId>, //TODO: strongly `Stored`
-    pub(crate) entries: SmallVec<[BindGroupEntry; DEFAULT_BIND_GROUPS]>,
+    pub(crate) entries: ArrayVec<[BindGroupEntry; MAX_BIND_GROUPS]>,
 }

 impl Binder {
     pub(crate) fn new(max_bind_groups: u32) -> Self {
         Self {
             pipeline_layout_id: None,
-            entries: smallvec![Default::default(); max_bind_groups as usize],
+            entries: (0..max_bind_groups)
+                .map(|_| BindGroupEntry::default())
+                .collect(),
         }
     }

diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs
index 3b27ad4f8..3de385a7c 100644
--- a/wgpu-core/src/command/bundle.rs
+++ b/wgpu-core/src/command/bundle.rs
@@ -2,39 +2,75 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

+/*! Render Bundles
+
+ ## Software implementation
+
+ The path from nothing to using a render bundle consists of 3 phases.
+
+ ### Initial command encoding
+
+ The user creates a `RenderBundleEncoder` and populates it by issuing commands
+ from the `bundle_ffi` module, just like with `RenderPass`, except that the
+ set of available commands is reduced. Everything is written into a `RawPass`.
+
+ ### Bundle baking
+
+ Once the commands are encoded, the user calls `render_bundle_encoder_finish`.
+ This is perhaps the most complex part of the logic. It consumes the
+ commands stored in `RawPass`, while validating everything, tracking the state,
+ and re-recording the commands into a separate `Vec<RenderCommand>`. It
+ doesn't actually execute any commands.
+
+ More importantly, the produced vector of commands is "normalized", which
+ means it can be executed verbatim without any state tracking. More
+ formally, a "normalized" command stream guarantees that any state required
+ by a draw call is set explicitly by one of the commands between the draw
+ call and the last pipeline change.
+
+ ### Execution
+
+ When the bundle is used in an actual render pass, `RenderBundle::execute` is
+ called. It goes through the commands and issues them into the native command
+ buffer. Thanks to the "normalized" property, it doesn't track any bind group
+ invalidations or index format changes.
+!*/
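To make the "normalized" property concrete, here is a self-contained toy
model of the de-duplication that `render_bundle_encoder_finish` performs
(the `Cmd` type and `normalize` function are illustrative stand-ins, not
wgpu-core items): a redundant re-bind is dropped, and any state a draw
depends on is re-emitted after a pipeline change, so replay needs no
tracking.

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Cmd {
        SetPipeline(u32),
        SetBind(u8, u32),
        Draw,
    }

    fn normalize(input: &[Cmd]) -> Vec<Cmd> {
        let mut out = Vec::new();
        let mut pending = None;
        let mut bound = None;
        for &c in input {
            match c {
                Cmd::SetPipeline(_) => {
                    out.push(c);
                    bound = None; // a pipeline change invalidates bindings
                }
                Cmd::SetBind(slot, id) => pending = Some((slot, id)),
                Cmd::Draw => {
                    // Emit the binding only if this draw actually needs it.
                    if pending.is_some() && pending != bound {
                        let (slot, id) = pending.unwrap();
                        out.push(Cmd::SetBind(slot, id));
                        bound = pending;
                    }
                    out.push(Cmd::Draw);
                }
            }
        }
        out
    }

    fn main() {
        let recorded = [
            Cmd::SetPipeline(1),
            Cmd::SetBind(0, 7),
            Cmd::Draw,
            Cmd::SetBind(0, 7), // redundant: dropped by normalization
            Cmd::Draw,
        ];
        assert_eq!(
            normalize(&recorded),
            vec![Cmd::SetPipeline(1), Cmd::SetBind(0, 7), Cmd::Draw, Cmd::Draw]
        );
    }

The real implementation below does this with the `State` tracker and its
`flush_*` methods rather than a standalone function.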
 use crate::{
-    command::{RawPass, RenderCommand},
+    command::{PhantomSlice, RawPass, RenderCommand},
     conv,
-    device::{Label, RenderPassContext},
-    hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Input, Token},
+    device::{AttachmentData, Label, RenderPassContext, MAX_VERTEX_BUFFERS},
+    hub::{GfxBackend, Global, GlobalIdentityHandlerFactory, Input, Storage, Token},
     id,
     resource::BufferUse,
     track::TrackerSet,
-    LifeGuard, RefCount,
+    LifeGuard, RefCount, Stored, MAX_BIND_GROUPS,
 };
 use arrayvec::ArrayVec;
 use peek_poke::{Peek, Poke};
+use std::{borrow::Borrow, iter, marker::PhantomData, ops::Range};

 #[derive(Debug)]
 pub struct RenderBundleEncoder {
-    pub(crate) raw: RawPass,
+    raw: RawPass,
     pub(crate) context: RenderPassContext,
-    pub(crate) sample_count: u8,
 }

 impl RenderBundleEncoder {
     pub fn new(desc: &wgt::RenderBundleEncoderDescriptor, device_id: id::DeviceId) -> Self {
         RenderBundleEncoder {
-            raw: RawPass::from_vec::<RenderCommand>(Vec::with_capacity(1), device_id),
+            raw: RawPass::new::<RenderCommand>(device_id),
             context: RenderPassContext {
-                colors: desc.color_formats.iter().cloned().collect(),
-                resolves: ArrayVec::new(),
-                depth_stencil: desc.depth_stencil_format,
-            },
-            sample_count: {
-                let sc = desc.sample_count;
-                assert!(sc != 0 && sc <= 32 && conv::is_power_of_two(sc));
-                sc as u8
+                attachments: AttachmentData {
+                    colors: desc.color_formats.iter().cloned().collect(),
+                    resolves: ArrayVec::new(),
+                    depth_stencil: desc.depth_stencil_format,
+                },
+                sample_count: {
+                    let sc = desc.sample_count;
+                    assert!(sc != 0 && sc <= 32 && conv::is_power_of_two(sc));
+                    sc as u8
+                },
             },
         }
     }

@@ -43,6 +79,10 @@ impl RenderBundleEncoder {
         self.raw.parent
     }

+    pub fn fill_commands(&mut self, commands: &[RenderCommand], offsets: &[wgt::DynamicOffset]) {
+        unsafe { self.raw.fill_render_commands(commands, offsets) }
+    }
+
     pub fn destroy(mut self) {
         unsafe { self.raw.invalidate() };
     }
 }

 //Note: here, `RenderBundle` is just wrapping a raw stream of render commands.
 // The plan is to back it by an actual Vulkan secondary buffer, D3D12 Bundle,
 // or Metal indirect command buffer.
-//Note: there is no API tracing support for `RenderBundle` yet.
-// It's transparent with regards to the submitted render passes.

 #[derive(Debug)]
 pub struct RenderBundle {
-    pub(crate) device_ref_count: RefCount,
-    pub(crate) raw: RawPass,
-    pub(crate) trackers: TrackerSet,
+    // Normalized command stream. It can be executed verbatim,
+    // without re-binding anything on the pipeline change.
+    commands: Vec<RenderCommand>,
+    dynamic_offsets: Vec<wgt::DynamicOffset>,
+    pub(crate) device_id: Stored<id::DeviceId>,
+    pub(crate) used: TrackerSet,
     pub(crate) context: RenderPassContext,
-    pub(crate) sample_count: u8,
     pub(crate) life_guard: LifeGuard,
 }

 unsafe impl Send for RenderBundle {}
 unsafe impl Sync for RenderBundle {}
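The dynamic offsets live in a side array: each `SetBindGroup` in the
normalized stream records only `num_dynamic_offsets`, and `execute` (below)
walks the flat `dynamic_offsets` vector front to back in lockstep with the
commands. A toy illustration of that walk, with simplified stand-in types:

    struct Bundle {
        commands: Vec<Cmd>,
        dynamic_offsets: Vec<u32>,
    }

    #[derive(Clone, Copy)]
    enum Cmd {
        SetBindGroup { index: u8, num_dynamic_offsets: u8 },
        Draw,
    }

    fn execute(bundle: &Bundle) {
        let mut offsets = bundle.dynamic_offsets.as_slice();
        for com in &bundle.commands {
            match *com {
                Cmd::SetBindGroup { index, num_dynamic_offsets } => {
                    // Take this group's offsets off the front of the flat array.
                    let (now, rest) = offsets.split_at(num_dynamic_offsets as usize);
                    println!("bind group {} with dynamic offsets {:?}", index, now);
                    offsets = rest;
                }
                Cmd::Draw => println!("draw"),
            }
        }
    }

    fn main() {
        execute(&Bundle {
            commands: vec![
                Cmd::SetBindGroup { index: 0, num_dynamic_offsets: 2 },
                Cmd::Draw,
            ],
            dynamic_offsets: vec![256, 512],
        });
    }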
+
+impl RenderBundle {
+    /// Actually encode the contents into a native command buffer.
+    ///
+    /// This is partially duplicating the logic of `command_encoder_run_render_pass`.
+    /// However, the point of this function is to be lighter, since we already had
+    /// a chance to go through the commands in `render_bundle_encoder_finish`.
+    pub(crate) unsafe fn execute<B: GfxBackend>(
+        &self,
+        comb: &mut B::CommandBuffer,
+        pipeline_layout_guard: &Storage<
+            crate::binding_model::PipelineLayout<B>,
+            id::PipelineLayoutId,
+        >,
+        bind_group_guard: &Storage<crate::binding_model::BindGroup<B>, id::BindGroupId>,
+        pipeline_guard: &Storage<crate::pipeline::RenderPipeline<B>, id::RenderPipelineId>,
+        buffer_guard: &Storage<crate::resource::Buffer<B>, id::BufferId>,
+    ) {
+        use hal::command::CommandBuffer as _;
+
+        let mut offsets = self.dynamic_offsets.as_slice();
+        let mut index_type = hal::IndexType::U16;
+        let mut pipeline_layout_id = None::<id::PipelineLayoutId>;
+
+        for command in self.commands.iter() {
+            match *command {
+                RenderCommand::SetBindGroup {
+                    index,
+                    num_dynamic_offsets,
+                    bind_group_id,
+                    phantom_offsets: _,
+                } => {
+                    let bind_group = &bind_group_guard[bind_group_id];
+                    comb.bind_graphics_descriptor_sets(
+                        &pipeline_layout_guard[pipeline_layout_id.unwrap()].raw,
+                        index as usize,
+                        iter::once(bind_group.raw.raw()),
+                        &offsets[..num_dynamic_offsets as usize],
+                    );
+                    offsets = &offsets[num_dynamic_offsets as usize..];
+                }
+                RenderCommand::SetPipeline(pipeline_id) => {
+                    let pipeline = &pipeline_guard[pipeline_id];
+                    comb.bind_graphics_pipeline(&pipeline.raw);
+                    index_type = conv::map_index_format(pipeline.index_format);
+                    pipeline_layout_id = Some(pipeline.layout_id.value);
+                }
+                RenderCommand::SetIndexBuffer {
+                    buffer_id,
+                    offset,
+                    size,
+                } => {
+                    let buffer = &buffer_guard[buffer_id];
+                    let view = hal::buffer::IndexBufferView {
+                        buffer: &buffer.raw,
+                        range: hal::buffer::SubRange {
+                            offset,
+                            size: if size != wgt::BufferSize::WHOLE {
+                                Some(size.0)
+                            } else {
+                                None
+                            },
+                        },
+                        index_type,
+                    };
+
+                    comb.bind_index_buffer(view);
+                }
+                RenderCommand::SetVertexBuffer {
+                    slot,
+                    buffer_id,
+                    offset,
+                    size,
+                } => {
+                    let buffer = &buffer_guard[buffer_id];
+                    let range = hal::buffer::SubRange {
+                        offset,
+                        size: if size != wgt::BufferSize::WHOLE {
+                            Some(size.0)
+                        } else {
+                            None
+                        },
+                    };
+                    comb.bind_vertex_buffers(slot, iter::once((&buffer.raw, range)));
+                }
+                RenderCommand::Draw {
+                    vertex_count,
+                    instance_count,
+                    first_vertex,
+                    first_instance,
+                } => {
+                    comb.draw(
+                        first_vertex..first_vertex + vertex_count,
+                        first_instance..first_instance + instance_count,
+                    );
+                }
+                RenderCommand::DrawIndexed {
+                    index_count,
+                    instance_count,
+                    first_index,
+                    base_vertex,
+                    first_instance,
+                } => {
+                    comb.draw_indexed(
+                        first_index..first_index + index_count,
+                        base_vertex,
+                        first_instance..first_instance + instance_count,
+                    );
+                }
+                RenderCommand::DrawIndirect { buffer_id, offset } => {
+                    let buffer = &buffer_guard[buffer_id];
+                    comb.draw_indirect(&buffer.raw, offset, 1, 0);
+                }
+                RenderCommand::DrawIndexedIndirect { buffer_id, offset } => {
+                    let buffer = &buffer_guard[buffer_id];
+                    comb.draw_indexed_indirect(&buffer.raw, offset, 1, 0);
+                }
+                RenderCommand::ExecuteBundle(_)
+                | RenderCommand::SetBlendColor(_)
+                | RenderCommand::SetStencilReference(_)
+                | RenderCommand::SetViewport { .. }
+                | RenderCommand::SetScissor(_)
+                | RenderCommand::End => unreachable!(),
+            }
+        }
+    }
+}
+
+impl Borrow<RefCount> for RenderBundle {
+    fn borrow(&self) -> &RefCount {
+        self.life_guard.ref_count.as_ref().unwrap()
+    }
+}
+
+#[derive(Debug)]
+struct IndexState {
+    buffer: Option<id::BufferId>,
+    format: wgt::IndexFormat,
+    range: Range<wgt::BufferAddress>,
+    is_dirty: bool,
+}
+
+impl IndexState {
+    fn new() -> Self {
+        IndexState {
+            buffer: None,
+            format: wgt::IndexFormat::default(),
+            range: 0..0,
+            is_dirty: false,
+        }
+    }
+
+    fn limit(&self) -> u32 {
+        assert!(self.buffer.is_some());
+        let bytes_per_index = match self.format {
+            wgt::IndexFormat::Uint16 => 2,
+            wgt::IndexFormat::Uint32 => 4,
+        };
+        ((self.range.end - self.range.start) / bytes_per_index) as u32
+    }
+
+    fn flush(&mut self) -> Option<RenderCommand> {
+        if self.is_dirty {
+            self.is_dirty = false;
+            Some(RenderCommand::SetIndexBuffer {
+                buffer_id: self.buffer.unwrap(),
+                offset: self.range.start,
+                size: wgt::BufferSize(self.range.end - self.range.start),
+            })
+        } else {
+            None
+        }
+    }
+
+    fn set_format(&mut self, format: wgt::IndexFormat) {
+        if self.format != format {
+            self.format = format;
+            self.is_dirty = true;
+        }
+    }
+
+    fn set_buffer(&mut self, id: id::BufferId, range: Range<wgt::BufferAddress>) {
+        self.buffer = Some(id);
+        self.range = range;
+        self.is_dirty = true;
+    }
+}
+
+#[derive(Debug)]
+struct VertexState {
+    buffer: Option<id::BufferId>,
+    range: Range<wgt::BufferAddress>,
+    stride: wgt::BufferAddress,
+    rate: wgt::InputStepMode,
+    is_dirty: bool,
+}
+
+impl VertexState {
+    fn new() -> Self {
+        VertexState {
+            buffer: None,
+            range: 0..0,
+            stride: 0,
+            rate: wgt::InputStepMode::Vertex,
+            is_dirty: false,
+        }
+    }
+
+    fn set_buffer(&mut self, buffer_id: id::BufferId, range: Range<wgt::BufferAddress>) {
+        self.buffer = Some(buffer_id);
+        self.range = range;
+        self.is_dirty = true;
+    }
+
+    fn flush(&mut self, slot: u32) -> Option<RenderCommand> {
+        if self.is_dirty {
+            self.is_dirty = false;
+            Some(RenderCommand::SetVertexBuffer {
+                slot,
+                buffer_id: self.buffer.unwrap(),
+                offset: self.range.start,
+                size: wgt::BufferSize(self.range.end - self.range.start),
+            })
+        } else {
+            None
+        }
+    }
+}
+
+#[derive(Debug)]
+struct BindState {
+    bind_group: Option<(id::BindGroupId, id::BindGroupLayoutId)>,
+    dynamic_offsets: Range<usize>,
+    is_dirty: bool,
+}
+
+impl BindState {
+    fn new() -> Self {
+        BindState {
+            bind_group: None,
+            dynamic_offsets: 0..0,
+            is_dirty: false,
+        }
+    }
+
+    fn set_group(
+        &mut self,
+        bind_group_id: id::BindGroupId,
+        layout_id: id::BindGroupLayoutId,
+        dyn_offset: usize,
+        dyn_count: usize,
+    ) -> bool {
+        match self.bind_group {
+            Some((bg_id, _)) if bg_id == bind_group_id && dyn_count == 0 => false,
+            _ => {
+                self.bind_group = Some((bind_group_id, layout_id));
+                self.dynamic_offsets = dyn_offset..dyn_offset + dyn_count;
+                self.is_dirty = true;
+                true
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+struct State {
+    trackers: TrackerSet,
+    index: IndexState,
+    vertex: ArrayVec<[VertexState; MAX_VERTEX_BUFFERS]>,
+    bind: ArrayVec<[BindState; MAX_BIND_GROUPS]>,
+    raw_dynamic_offsets: Vec<wgt::DynamicOffset>,
+    flat_dynamic_offsets: Vec<wgt::DynamicOffset>,
+    used_bind_groups: usize,
+}
+
+impl State {
+    fn vertex_limits(&self) -> (u32, u32) {
+        let mut vertex_limit = !0;
+        let mut instance_limit = !0;
+        for vbs in &self.vertex {
+            if vbs.stride == 0 {
+                continue;
+            }
+            let limit = ((vbs.range.end - vbs.range.start) / vbs.stride) as u32;
+            match vbs.rate {
+                wgt::InputStepMode::Vertex => vertex_limit = vertex_limit.min(limit),
+                wgt::InputStepMode::Instance => instance_limit = instance_limit.min(limit),
+            }
+        }
+        (vertex_limit, instance_limit)
+    }
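A worked example of the limit computation above, with made-up numbers: a
per-vertex buffer bound with 64 bytes at stride 16 caps draws at 4 vertices,
and a per-instance buffer with 24 bytes at stride 8 caps them at 3 instances;
the draw calls recorded into the bundle are validated against these minima.

    fn main() {
        // (range.end - range.start) / stride, minimized per step mode:
        let vertex_limit = (64 - 0) / 16; // an InputStepMode::Vertex buffer
        let instance_limit = (24 - 0) / 8; // an InputStepMode::Instance buffer
        assert_eq!((vertex_limit, instance_limit), (4, 3));
    }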
+
+    fn invalidate_group_from(&mut self, slot: usize) {
+        for bind in self.bind[slot..].iter_mut() {
+            if bind.bind_group.is_some() {
+                bind.is_dirty = true;
+            }
+        }
+    }
+
+    fn set_bind_group(
+        &mut self,
+        slot: u8,
+        bind_group_id: id::BindGroupId,
+        layout_id: id::BindGroupLayoutId,
+        offsets: &[wgt::DynamicOffset],
+    ) {
+        if self.bind[slot as usize].set_group(
+            bind_group_id,
+            layout_id,
+            self.raw_dynamic_offsets.len(),
+            offsets.len(),
+        ) {
+            self.invalidate_group_from(slot as usize + 1);
+        }
+        self.raw_dynamic_offsets.extend(offsets);
+    }
+
+    fn set_pipeline(
+        &mut self,
+        index_format: wgt::IndexFormat,
+        vertex_strides: &[(wgt::BufferAddress, wgt::InputStepMode)],
+        layout_ids: &[Stored<id::BindGroupLayoutId>],
+    ) {
+        self.index.set_format(index_format);
+        for (vs, &(stride, step_mode)) in self.vertex.iter_mut().zip(vertex_strides) {
+            if vs.stride != stride || vs.rate != step_mode {
+                vs.stride = stride;
+                vs.rate = step_mode;
+                vs.is_dirty = true;
+            }
+        }
+        self.used_bind_groups = layout_ids.len();
+        let invalid_from = self
+            .bind
+            .iter()
+            .zip(layout_ids)
+            .position(|(bs, layout_id)| match bs.bind_group {
+                Some((_, bgl_id)) => bgl_id != layout_id.value,
+                None => false,
+            });
+        if let Some(slot) = invalid_from {
+            self.invalidate_group_from(slot);
+        }
+    }
+
+    fn flush_vertices(&mut self) -> impl Iterator<Item = RenderCommand> + '_ {
+        self.vertex
+            .iter_mut()
+            .enumerate()
+            .flat_map(|(i, vs)| vs.flush(i as u32))
+    }
+
+    fn flush_binds(&mut self) -> impl Iterator<Item = RenderCommand> + '_ {
+        for bs in self.bind[..self.used_bind_groups].iter() {
+            if bs.is_dirty {
+                self.flat_dynamic_offsets
+                    .extend_from_slice(&self.raw_dynamic_offsets[bs.dynamic_offsets.clone()]);
+            }
+        }
+        self.bind
+            .iter_mut()
+            .take(self.used_bind_groups)
+            .enumerate()
+            .flat_map(|(i, bs)| {
+                if bs.is_dirty {
+                    bs.is_dirty = false;
+                    Some(RenderCommand::SetBindGroup {
+                        index: i as u8,
+                        bind_group_id: bs.bind_group.unwrap().0,
+                        num_dynamic_offsets: (bs.dynamic_offsets.end - bs.dynamic_offsets.start)
+                            as u8,
+                        phantom_offsets: PhantomSlice::default(),
+                    })
+                } else {
+                    None
+                }
+            })
+    }
+}
+
 impl<G: GlobalIdentityHandlerFactory> Global<G> {
     pub fn render_bundle_encoder_finish<B: GfxBackend>(
         &self,
         mut bundle_encoder: RenderBundleEncoder,
         _desc: &wgt::RenderBundleDescriptor