Fix GL Push Constant Layout (#4607)

* It verks! * More tests * Fixes * Working multi-stage push constants * Comments * Add push constant partial update teste * Docs * Update Cargo.toml * Comments
2024-11-22 06:44:14 +00:00 · 2023-11-06 07:58:26 -05:00 · 2023-11-06 07:58:26 -05:00 · 7f72c9fc3b
commit 7f72c9fc3b
parent 267bd488d3
30 changed files with 989 additions and 249 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1567,8 +1567,7 @@ checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945"
 [[package]]
 name = "glow"
 version = "0.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "886c2a30b160c4c6fec8f987430c26b526b7988ca71f664e6a699ddf6f9601e4"
+source = "git+https://github.com/grovesNL/glow.git?rev=29ff917a2b2ff7ce0a81b2cc5681de6d4735b36e#29ff917a2b2ff7ce0a81b2cc5681de6d4735b36e"
 dependencies = [
 "js-sys",
 "slotmap",
--- a/naga/src/back/glsl/mod.rs
+++ b/naga/src/back/glsl/mod.rs
@ -309,6 +309,8 @@ pub struct ReflectionInfo {
    pub uniforms: crate::FastHashMap<Handle<crate::GlobalVariable>, String>,
    /// Mapping between names and attribute locations.
    pub varying: crate::FastHashMap<String, VaryingLocation>,
+    /// List of push constant items in the shader.
+    pub push_constant_items: Vec<PushConstantItem>,
 }

 /// Mapping between a texture and its sampler, if it exists.
@ -328,6 +330,50 @@ pub struct TextureMapping {
    pub sampler: Option<Handle<crate::GlobalVariable>>,
 }

+/// All information to bind a single uniform value to the shader.
+///
+/// Push constants are emulated using traditional uniforms in OpenGL.
+///
+/// These are composed of a set of primatives (scalar, vector, matrix) that
+/// are given names. Because they are not backed by the concept of a buffer,
+/// we must do the work of calculating the offset of each primative in the
+/// push constant block.
+#[derive(Debug, Clone)]
+pub struct PushConstantItem {
+    /// GL uniform name for the item. This name is the same as if you were
+    /// to access it directly from a GLSL shader.
+    ///
+    /// The with the following example, the following names will be generated,
+    /// one name per GLSL uniform.
+    ///
+    /// ```glsl
+    /// struct InnerStruct {
+    ///     value: f32,
+    /// }
+    ///
+    /// struct PushConstant {
+    ///     InnerStruct inner;
+    ///     vec4 array[2];
+    /// }
+    ///
+    /// uniform PushConstants _push_constant_binding_cs;
+    /// ```
+    ///
+    /// ```text
+    /// - _push_constant_binding_cs.inner.value
+    /// - _push_constant_binding_cs.array[0]
+    /// - _push_constant_binding_cs.array[1]
+    /// ```
+    ///
+    pub access_path: String,
+    /// Type of the uniform. This will only ever be a scalar, vector, or matrix.
+    pub ty: Handle<crate::Type>,
+    /// The offset in the push constant memory block this uniform maps to.
+    ///
+    /// The size of the uniform can be derived from the type.
+    pub offset: u32,
+}
+
 /// Helper structure that generates a number
 #[derive(Default)]
 struct IdGenerator(u32);
@ -1264,8 +1310,8 @@ impl<'a, W: Write> Writer<'a, W> {
        handle: Handle<crate::GlobalVariable>,
        global: &crate::GlobalVariable,
    ) -> String {
-        match global.binding {
-            Some(ref br) => {
+        match (&global.binding, global.space) {
+            (&Some(ref br), _) => {
                format!(
                    "_group_{}_binding_{}_{}",
                    br.group,
@ -1273,7 +1319,10 @@ impl<'a, W: Write> Writer<'a, W> {
                    self.entry_point.stage.to_str()
                )
            }
-            None => self.names[&NameKey::GlobalVariable(handle)].clone(),
+            (&None, crate::AddressSpace::PushConstant) => {
+                format!("_push_constant_binding_{}", self.entry_point.stage.to_str())
+            }
+            (&None, _) => self.names[&NameKey::GlobalVariable(handle)].clone(),
        }
    }

@ -1283,15 +1332,20 @@ impl<'a, W: Write> Writer<'a, W> {
        handle: Handle<crate::GlobalVariable>,
        global: &crate::GlobalVariable,
    ) -> BackendResult {
-        match global.binding {
-            Some(ref br) => write!(
+        match (&global.binding, global.space) {
+            (&Some(ref br), _) => write!(
                self.out,
                "_group_{}_binding_{}_{}",
                br.group,
                br.binding,
                self.entry_point.stage.to_str()
            )?,
-            None => write!(
+            (&None, crate::AddressSpace::PushConstant) => write!(
+                self.out,
+                "_push_constant_binding_{}",
+                self.entry_point.stage.to_str()
+            )?,
+            (&None, _) => write!(
                self.out,
                "{}",
                &self.names[&NameKey::GlobalVariable(handle)]
@ -4069,6 +4123,7 @@ impl<'a, W: Write> Writer<'a, W> {
            }
        }

+        let mut push_constant_info = None;
        for (handle, var) in self.module.global_variables.iter() {
            if info[handle].is_empty() {
                continue;
@ -4093,17 +4148,105 @@ impl<'a, W: Write> Writer<'a, W> {
                        let name = self.reflection_names_globals[&handle].clone();
                        uniforms.insert(handle, name);
                    }
+                    crate::AddressSpace::PushConstant => {
+                        let name = self.reflection_names_globals[&handle].clone();
+                        push_constant_info = Some((name, var.ty));
+                    }
                    _ => (),
                },
            }
        }

+        let mut push_constant_segments = Vec::new();
+        let mut push_constant_items = vec![];
+
+        if let Some((name, ty)) = push_constant_info {
+            // We don't have a layouter available to us, so we need to create one.
+            //
+            // This is potentially a bit wasteful, but the set of types in the program
+            // shouldn't be too large.
+            let mut layouter = crate::proc::Layouter::default();
+            layouter.update(self.module.to_ctx()).unwrap();
+
+            // We start with the name of the binding itself.
+            push_constant_segments.push(name);
+
+            // We then recursively collect all the uniform fields of the push constant.
+            self.collect_push_constant_items(
+                ty,
+                &mut push_constant_segments,
+                &layouter,
+                &mut 0,
+                &mut push_constant_items,
+            );
+        }
+
        Ok(ReflectionInfo {
            texture_mapping,
            uniforms,
            varying: mem::take(&mut self.varying),
+            push_constant_items,
        })
    }
+
+    fn collect_push_constant_items(
+        &mut self,
+        ty: Handle<crate::Type>,
+        segments: &mut Vec<String>,
+        layouter: &crate::proc::Layouter,
+        offset: &mut u32,
+        items: &mut Vec<PushConstantItem>,
+    ) {
+        // At this point in the recursion, `segments` contains the path
+        // needed to access `ty` from the root.
+
+        let layout = &layouter[ty];
+        *offset = layout.alignment.round_up(*offset);
+        match self.module.types[ty].inner {
+            // All these types map directly to GL uniforms.
+            TypeInner::Scalar { .. } | TypeInner::Vector { .. } | TypeInner::Matrix { .. } => {
+                // Build the full name, by combining all current segments.
+                let name: String = segments.iter().map(String::as_str).collect();
+                items.push(PushConstantItem {
+                    access_path: name,
+                    offset: *offset,
+                    ty,
+                });
+                *offset += layout.size;
+            }
+            // Arrays are recursed into.
+            TypeInner::Array { base, size, .. } => {
+                let crate::ArraySize::Constant(count) = size else {
+                    unreachable!("Cannot have dynamic arrays in push constants");
+                };
+
+                for i in 0..count.get() {
+                    // Add the array accessor and recurse.
+                    segments.push(format!("[{}]", i));
+                    self.collect_push_constant_items(base, segments, layouter, offset, items);
+                    segments.pop();
+                }
+
+                // Ensure the stride is kept by rounding up to the alignment.
+                *offset = layout.alignment.round_up(*offset)
+            }
+            TypeInner::Struct { ref members, .. } => {
+                for (index, member) in members.iter().enumerate() {
+                    // Add struct accessor and recurse.
+                    segments.push(format!(
+                        ".{}",
+                        self.names[&NameKey::StructMember(ty, index as u32)]
+                    ));
+                    self.collect_push_constant_items(member.ty, segments, layouter, offset, items);
+                    segments.pop();
+                }
+
+                // Ensure ending padding is kept by rounding up to the alignment.
+                *offset = layout.alignment.round_up(*offset)
+            }
+            _ => unreachable!(),
+        }
+    }
 }

 /// Structure returned by [`glsl_scalar`]
--- a/naga/tests/out/glsl/push-constants.main.Fragment.glsl
+++ b/naga/tests/out/glsl/push-constants.main.Fragment.glsl
@ -9,14 +9,14 @@ struct PushConstants {
 struct FragmentIn {
    vec4 color;
 };
-uniform PushConstants pc;
+uniform PushConstants _push_constant_binding_fs;

 layout(location = 0) smooth in vec4 _vs2fs_location0;
 layout(location = 0) out vec4 _fs2p_location0;

 void main() {
    FragmentIn in_ = FragmentIn(_vs2fs_location0);
-    float _e4 = pc.multiplier;
+    float _e4 = _push_constant_binding_fs.multiplier;
    _fs2p_location0 = (in_.color * _e4);
    return;
 }
--- a/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl
+++ b/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl
@ -9,14 +9,14 @@ struct PushConstants {
 struct FragmentIn {
    vec4 color;
 };
-uniform PushConstants pc;
+uniform PushConstants _push_constant_binding_vs;

 layout(location = 0) in vec2 _p2vs_location0;

 void main() {
    vec2 pos = _p2vs_location0;
    uint vi = uint(gl_VertexID);
-    float _e5 = pc.multiplier;
+    float _e5 = _push_constant_binding_vs.multiplier;
    gl_Position = vec4(((float(vi) * _e5) * pos), 0.0, 1.0);
    return;
 }
--- a/tests/src/image.rs
+++ b/tests/src/image.rs
@ -625,12 +625,16 @@ impl ReadbackBuffers {
        buffer_zero && stencil_buffer_zero
    }

-    pub fn check_buffer_contents(&self, device: &Device, expected_data: &[u8]) -> bool {
-        let result = self
-            .retrieve_buffer(device, &self.buffer, self.buffer_aspect())
-            .iter()
-            .eq(expected_data.iter());
+    pub fn assert_buffer_contents(&self, device: &Device, expected_data: &[u8]) {
+        let result_buffer = self.retrieve_buffer(device, &self.buffer, self.buffer_aspect());
+        assert!(
+            result_buffer.len() >= expected_data.len(),
+            "Result buffer ({}) smaller than expected buffer ({})",
+            result_buffer.len(),
+            expected_data.len()
+        );
+        let result_buffer = &result_buffer[..expected_data.len()];
+        assert_eq!(result_buffer, expected_data);
        self.buffer.unmap();
-        result
    }
 }
--- a/tests/tests/gpu.rs
+++ b/tests/tests/gpu.rs
@ -1,4 +1,5 @@
 mod regression {
+    mod issue_3349;
    mod issue_3457;
    mod issue_4024;
    mod issue_4122;
@ -19,6 +20,7 @@ mod occlusion_query;
 mod partially_bounded_arrays;
 mod pipeline;
 mod poll;
+mod push_constants;
 mod query_set;
 mod queue_transfer;
 mod resource_descriptor_accessor;
--- a/tests/tests/partially_bounded_arrays/mod.rs
+++ b/tests/tests/partially_bounded_arrays/mod.rs
@ -97,9 +97,6 @@ static PARTIALLY_BOUNDED_ARRAY: GpuTestConfiguration = GpuTestConfiguration::new

        ctx.queue.submit(Some(encoder.finish()));

-        assert!(
-            readback_buffers
-                .check_buffer_contents(device, bytemuck::bytes_of(&[4.0f32, 3.0, 2.0, 1.0])),
-            "texture storage values are incorrect!"
-        );
+        readback_buffers
+            .assert_buffer_contents(device, bytemuck::bytes_of(&[4.0f32, 3.0, 2.0, 1.0]));
    });
--- a/tests/tests/push_constants.rs
+++ b/tests/tests/push_constants.rs
@ -0,0 +1,151 @@
+use std::num::NonZeroU64;
+
+use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext};
+
+/// We want to test that partial updates to push constants work as expected.
+///
+/// As such, we dispatch two compute passes, one which writes the values
+/// before a parital update, and one which writes the values after the partial update.
+///
+/// If the update code is working correctly, the values not written to by the second update
+/// will remain unchanged.
+#[gpu_test]
+static PARTIAL_UPDATE: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .features(wgpu::Features::PUSH_CONSTANTS)
+            .limits(wgpu::Limits {
+                max_push_constant_size: 32,
+                ..Default::default()
+            }),
+    )
+    .run_sync(partial_update_test);
+
+const SHADER: &str = r#"
+    struct Pc {
+        offset: u32,
+        vector: vec4f,
+    }
+
+    var<push_constant> pc: Pc;
+
+    @group(0) @binding(0)
+    var<storage, read_write> output: array<vec4f>;
+
+    @compute @workgroup_size(1)
+    fn main() {
+        output[pc.offset] = pc.vector;
+    }
+"#;
+
+fn partial_update_test(ctx: TestingContext) {
+    let sm = ctx
+        .device
+        .create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("shader"),
+            source: wgpu::ShaderSource::Wgsl(SHADER.into()),
+        });
+
+    let bgl = ctx
+        .device
+        .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: Some("bind_group_layout"),
+            entries: &[wgpu::BindGroupLayoutEntry {
+                binding: 0,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: false },
+                    has_dynamic_offset: false,
+                    min_binding_size: NonZeroU64::new(16),
+                },
+                count: None,
+            }],
+        });
+
+    let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("gpu_buffer"),
+        size: 32,
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    });
+
+    let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("cpu_buffer"),
+        size: 32,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    });
+
+    let bind_group = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
+        label: Some("bind_group"),
+        layout: &bgl,
+        entries: &[wgpu::BindGroupEntry {
+            binding: 0,
+            resource: gpu_buffer.as_entire_binding(),
+        }],
+    });
+
+    let pipeline_layout = ctx
+        .device
+        .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("pipeline_layout"),
+            bind_group_layouts: &[&bgl],
+            push_constant_ranges: &[wgpu::PushConstantRange {
+                stages: wgpu::ShaderStages::COMPUTE,
+                range: 0..32,
+            }],
+        });
+
+    let pipeline = ctx
+        .device
+        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("pipeline"),
+            layout: Some(&pipeline_layout),
+            module: &sm,
+            entry_point: "main",
+        });
+
+    let mut encoder = ctx
+        .device
+        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+            label: Some("encoder"),
+        });
+
+    {
+        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+            label: Some("compute_pass"),
+            timestamp_writes: None,
+        });
+        cpass.set_pipeline(&pipeline);
+        cpass.set_bind_group(0, &bind_group, &[]);
+
+        // -- Dispatch 0 --
+
+        // Dispatch number
+        cpass.set_push_constants(0, bytemuck::bytes_of(&[0_u32]));
+        // Update the whole vector.
+        cpass.set_push_constants(16, bytemuck::bytes_of(&[1.0_f32, 2.0, 3.0, 4.0]));
+        cpass.dispatch_workgroups(1, 1, 1);
+
+        // -- Dispatch 1 --
+
+        // Dispatch number
+        cpass.set_push_constants(0, bytemuck::bytes_of(&[1_u32]));
+        // Update just the y component of the vector.
+        cpass.set_push_constants(20, bytemuck::bytes_of(&[5.0_f32]));
+        cpass.dispatch_workgroups(1, 1, 1);
+    }
+
+    encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, 32);
+    ctx.queue.submit([encoder.finish()]);
+    cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());
+    ctx.device.poll(wgpu::Maintain::Wait);
+
+    let data = cpu_buffer.slice(..).get_mapped_range();
+
+    let floats: &[f32] = bytemuck::cast_slice(&data);
+
+    // first 4 floats the initial value
+    // second 4 floats the first update
+    assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 1.0, 5.0, 3.0, 4.0]);
+}
--- a/tests/tests/regression/issue_3349.fs.wgsl
+++ b/tests/tests/regression/issue_3349.fs.wgsl
@ -0,0 +1,46 @@
+struct ShaderData {
+    a: f32,
+    b: f32,
+    c: f32,
+    d: f32,
+}
+
+@group(0) @binding(0)
+var<uniform> data1: ShaderData;
+
+var<push_constant> data2: ShaderData;
+
+struct FsIn {
+    @builtin(position) position: vec4f,
+    @location(0) data1: vec4f,
+    @location(1) data2: vec4f,
+}
+
+@fragment
+fn fs_main(fs_in: FsIn) -> @location(0) vec4f {
+    let floored = vec2u(floor(fs_in.position.xy));
+    // We're outputting a 2x2 image, each pixel coming from a different source
+    let serial = floored.x + floored.y * 2u;
+
+    switch serial {
+        // (0, 0) - uniform buffer from the vertex shader
+        case 0u: {
+            return fs_in.data1;
+        }
+        // (1, 0) - push constant from the vertex shader
+        case 1u: {
+            return fs_in.data2;
+        }
+        // (0, 1) - uniform buffer from the fragment shader
+        case 2u: {
+            return vec4f(data1.a, data1.b, data1.c, data1.d);
+        }
+        // (1, 1) - push constant from the fragment shader
+        case 3u: {
+            return vec4f(data2.a, data2.b, data2.c, data2.d);
+        }
+        default: {
+            return vec4f(0.0);
+        }
+    }
+}
--- a/tests/tests/regression/issue_3349.rs
+++ b/tests/tests/regression/issue_3349.rs
@ -0,0 +1,178 @@
+use wgpu::util::DeviceExt;
+use wgpu_test::{
+    gpu_test, image::ReadbackBuffers, GpuTestConfiguration, TestParameters, TestingContext,
+};
+
+/// We thought we had an OpenGL bug that, when running without explicit in-shader locations,
+/// we will not properly bind uniform buffers to both the vertex and fragment
+/// shaders. This turned out to not reproduce at all with this test case.
+///
+/// However, it also caught issues with the push constant implementation,
+/// making sure that it works correctly with different definitions for the push constant
+/// block in vertex and fragment shaders.
+///
+/// This test needs to be able to run on GLES 3.0
+///
+/// What this test does is render a 2x2 texture. Each pixel corresponds to a different
+/// data source.
+///
+/// top left: Vertex Shader / Uniform Buffer
+/// top right: Vertex Shader / Push Constant
+/// bottom left: Fragment Shader / Uniform Buffer
+/// bottom right: Fragment Shader / Push Constant
+///
+/// We then validate the data is correct from every position.
+#[gpu_test]
+static MULTI_STAGE_DATA_BINDING: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .features(wgpu::Features::PUSH_CONSTANTS)
+            .limits(wgpu::Limits {
+                max_push_constant_size: 16,
+                ..Default::default()
+            }),
+    )
+    .run_sync(multi_stage_data_binding_test);
+
+fn multi_stage_data_binding_test(ctx: TestingContext) {
+    // We use different shader modules to allow us to use different
+    // types for the uniform and push constant blocks between stages.
+    let vs_sm = ctx
+        .device
+        .create_shader_module(wgpu::include_wgsl!("issue_3349.vs.wgsl"));
+
+    let fs_sm = ctx
+        .device
+        .create_shader_module(wgpu::include_wgsl!("issue_3349.fs.wgsl"));
+
+    // We start with u8s then convert to float, to make sure we don't have
+    // cross-vendor rounding issues unorm.
+    let input_as_unorm: [u8; 4] = [25_u8, 50, 75, 100];
+    let input = input_as_unorm.map(|v| v as f32 / 255.0);
+
+    let buffer = ctx
+        .device
+        .create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("buffer"),
+            contents: bytemuck::cast_slice(&input),
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+        });
+
+    let bgl = ctx
+        .device
+        .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: Some("bgl"),
+            entries: &[wgpu::BindGroupLayoutEntry {
+                binding: 0,
+                visibility: wgpu::ShaderStages::VERTEX_FRAGMENT,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Uniform,
+                    has_dynamic_offset: false,
+                    min_binding_size: None,
+                },
+                count: None,
+            }],
+        });
+
+    let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
+        label: Some("bg"),
+        layout: &bgl,
+        entries: &[wgpu::BindGroupEntry {
+            binding: 0,
+            resource: buffer.as_entire_binding(),
+        }],
+    });
+
+    let pll = ctx
+        .device
+        .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("pll"),
+            bind_group_layouts: &[&bgl],
+            push_constant_ranges: &[wgpu::PushConstantRange {
+                stages: wgpu::ShaderStages::VERTEX_FRAGMENT,
+                range: 0..16,
+            }],
+        });
+
+    let pipeline = ctx
+        .device
+        .create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some("pipeline"),
+            layout: Some(&pll),
+            vertex: wgpu::VertexState {
+                module: &vs_sm,
+                entry_point: "vs_main",
+                buffers: &[],
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &fs_sm,
+                entry_point: "fs_main",
+                targets: &[Some(wgpu::ColorTargetState {
+                    format: wgpu::TextureFormat::Rgba8Unorm,
+                    blend: None,
+                    write_mask: wgpu::ColorWrites::ALL,
+                })],
+            }),
+            primitive: wgpu::PrimitiveState::default(),
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+        });
+
+    let texture = ctx.device.create_texture(&wgpu::TextureDescriptor {
+        label: Some("texture"),
+        size: wgpu::Extent3d {
+            width: 2,
+            height: 2,
+            depth_or_array_layers: 1,
+        },
+        mip_level_count: 1,
+        sample_count: 1,
+        dimension: wgpu::TextureDimension::D2,
+        // Important: NOT srgb.
+        format: wgpu::TextureFormat::Rgba8Unorm,
+        usage: wgpu::TextureUsages::COPY_SRC | wgpu::TextureUsages::RENDER_ATTACHMENT,
+        view_formats: &[],
+    });
+
+    let view = texture.create_view(&wgpu::TextureViewDescriptor::default());
+
+    let mut encoder = ctx
+        .device
+        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+            label: Some("encoder"),
+        });
+
+    {
+        let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: Some("rpass"),
+            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                view: &view,
+                resolve_target: None,
+                ops: wgpu::Operations {
+                    load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
+                    store: wgpu::StoreOp::Store,
+                },
+            })],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
+
+        rpass.set_pipeline(&pipeline);
+        rpass.set_bind_group(0, &bg, &[]);
+        rpass.set_push_constants(
+            wgpu::ShaderStages::VERTEX_FRAGMENT,
+            0,
+            bytemuck::cast_slice(&input),
+        );
+        rpass.draw(0..3, 0..1);
+    }
+
+    let buffers = ReadbackBuffers::new(&ctx.device, &texture);
+    buffers.copy_from(&ctx.device, &mut encoder, &texture);
+    ctx.queue.submit([encoder.finish()]);
+
+    let result = input_as_unorm.repeat(4);
+    buffers.assert_buffer_contents(&ctx.device, &result);
+}
--- a/tests/tests/regression/issue_3349.vs.wgsl
+++ b/tests/tests/regression/issue_3349.vs.wgsl
@ -0,0 +1,22 @@
+@group(0) @binding(0)
+var<uniform> data1: vec4f;
+
+// D3DCompile requires this to be a struct
+struct Pc {
+    inner: vec4f,
+}
+
+var<push_constant> data2: Pc;
+
+struct VsOut {
+    @builtin(position) position: vec4f,
+    @location(0) data1: vec4f,
+    @location(1) data2: vec4f,
+}
+
+@vertex
+fn vs_main(@builtin(vertex_index) vertexIndex: u32) -> VsOut {
+    let uv = vec2f(f32((vertexIndex << 1u) & 2u), f32(vertexIndex & 2u));
+    let position = vec4f(uv * 2.0 - 1.0, 0.0, 1.0);
+    return VsOut(position, data1, data2.inner);
+}
--- a/tests/tests/scissor_tests/mod.rs
+++ b/tests/tests/scissor_tests/mod.rs
@ -94,7 +94,7 @@ fn scissor_test_impl(ctx: &TestingContext, scissor_rect: Rect, expected_data: [u
        readback_buffer.copy_from(&ctx.device, &mut encoder, &texture);
        ctx.queue.submit(Some(encoder.finish()));
    }
-    assert!(readback_buffer.check_buffer_contents(&ctx.device, &expected_data));
+    readback_buffer.assert_buffer_contents(&ctx.device, &expected_data);
 }

 #[gpu_test]
--- a/tests/tests/shader/mod.rs
+++ b/tests/tests/shader/mod.rs
@ -40,6 +40,8 @@ impl InputStorageType {
 struct ShaderTest {
    /// Human readable name
    name: String,
+    /// Header text. This is arbitrary code injected at the top of the shader. Replaces {{header}}
+    header: String,
    /// This text will be the body of the `Input` struct. Replaces "{{input_members}}"
    /// in the shader_test shader.
    custom_struct_members: String,
@ -132,6 +134,7 @@ impl ShaderTest {
    ) -> Self {
        Self {
            name,
+            header: String::new(),
            custom_struct_members,
            body,
            input_type: String::from("CustomStruct"),
@ -144,6 +147,12 @@ impl ShaderTest {
        }
    }

+    fn header(mut self, header: String) -> Self {
+        self.header = header;
+
+        self
+    }
+
    /// Add another set of possible outputs. If any of the given
    /// output values are seen it's considered a success (i.e. this is OR, not AND).
    ///
@ -272,6 +281,7 @@ fn shader_input_output_test(
        // This isn't terribly efficient but the string is short and it's a test.
        // The body and input members are the longest part, so do them last.
        let mut processed = source
+            .replace("{{header}}", &test.header)
            .replace("{{storage_type}}", storage_type.as_str())
            .replace("{{input_type}}", &test.input_type)
            .replace("{{output_type}}", &test.output_type)
--- a/tests/tests/shader/shader_test.wgsl
+++ b/tests/tests/shader/shader_test.wgsl
@ -1,3 +1,5 @@
+{{header}}
+
 struct CustomStruct {
    {{input_members}}
 }
--- a/tests/tests/shader/struct_layout.rs
+++ b/tests/tests/shader/struct_layout.rs
@ -99,7 +99,7 @@ fn create_struct_layout_tests(storage_type: InputStorageType) -> Vec<ShaderTest>
                }
            }

-            // https://github.com/gfx-rs/naga/issues/1785
+            // https://github.com/gfx-rs/wgpu/issues/4371
            let failures = if storage_type == InputStorageType::Uniform && rows == 2 {
                Backends::GL
            } else {
@ -171,6 +171,51 @@ fn create_struct_layout_tests(storage_type: InputStorageType) -> Vec<ShaderTest>
        }
    }

+    // Nested struct and array test.
+    //
+    // This tries to exploit all the weird edge cases of the struct layout algorithm.
+    {
+        let header =
+            String::from("struct Inner { scalar: f32, member: array<vec3<f32>, 2>, scalar2: f32 }");
+        let members = String::from("inner: Inner, scalar3: f32, vector: vec3<f32>, scalar4: f32");
+        let direct = String::from(
+            "\
+            output[0] = bitcast<u32>(input.inner.scalar);
+            output[1] = bitcast<u32>(input.inner.member[0].x);
+            output[2] = bitcast<u32>(input.inner.member[0].y);
+            output[3] = bitcast<u32>(input.inner.member[0].z);
+            output[4] = bitcast<u32>(input.inner.member[1].x);
+            output[5] = bitcast<u32>(input.inner.member[1].y);
+            output[6] = bitcast<u32>(input.inner.member[1].z);
+            output[7] = bitcast<u32>(input.inner.scalar2);
+            output[8] = bitcast<u32>(input.scalar3);
+            output[9] = bitcast<u32>(input.vector.x);
+            output[10] = bitcast<u32>(input.vector.y);
+            output[11] = bitcast<u32>(input.vector.z);
+            output[12] = bitcast<u32>(input.scalar4);
+        ",
+        );
+
+        tests.push(
+            ShaderTest::new(
+                String::from("nested struct and array"),
+                members,
+                direct,
+                &input_values,
+                &[
+                    0, // inner.scalar
+                    4, 5, 6, // inner.member[0]
+                    8, 9, 10, // inner.member[1]
+                    12, // scalar2
+                    16, // scalar3
+                    20, 21, 22, // vector
+                    23, // scalar4
+                ],
+            )
+            .header(header),
+        );
+    }
+
    tests
 }

@ -215,8 +260,7 @@ static PUSH_CONSTANT_INPUT: GpuTestConfiguration = GpuTestConfiguration::new()
            .limits(Limits {
                max_push_constant_size: MAX_BUFFER_SIZE as u32,
                ..Limits::downlevel_defaults()
-            })
-            .expect_fail(FailureCase::backend(Backends::GL)),
+            }),
    )
    .run_sync(|ctx| {
        shader_input_output_test(
--- a/tests/tests/shader_primitive_index/mod.rs
+++ b/tests/tests/shader_primitive_index/mod.rs
@ -192,5 +192,5 @@ fn pulling_common(
    }
    readback_buffer.copy_from(&ctx.device, &mut encoder, &color_texture);
    ctx.queue.submit(Some(encoder.finish()));
-    assert!(readback_buffer.check_buffer_contents(&ctx.device, expected));
+    readback_buffer.assert_buffer_contents(&ctx.device, expected);
 }
--- a/wgpu-hal/Cargo.toml
+++ b/wgpu-hal/Cargo.toml
@ -96,7 +96,7 @@ rustc-hash = "1.1"
 log = "0.4"

 # backend: Gles
-glow = { version = "0.13", optional = true }
+glow = { version = "0.13", git = "https://github.com/grovesNL/glow.git", rev = "29ff917a2b2ff7ce0a81b2cc5681de6d4735b36e", optional = true }

 [dependencies.wgt]
 package = "wgpu-types"
@ -180,7 +180,9 @@ features = ["wgsl-in"]
 [dev-dependencies]
 cfg-if = "1"
 env_logger = "0.10"
-winit = { version = "0.29.2", features = [ "android-native-activity" ] }   # for "halmark" example
+winit = { version = "0.29.2", features = [
+    "android-native-activity",
+] } # for "halmark" example

 [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
 glutin = "0.29.1" # for "gles" example
--- a/wgpu-hal/src/dx11/command.rs
+++ b/wgpu-hal/src/dx11/command.rs
@ -96,7 +96,7 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
        &mut self,
        layout: &super::PipelineLayout,
        stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
        todo!()
--- a/wgpu-hal/src/dx12/command.rs
+++ b/wgpu-hal/src/dx12/command.rs
@ -911,15 +911,16 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
        &mut self,
        layout: &super::PipelineLayout,
        _stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
+        let offset_words = offset_bytes as usize / 4;
+
        let info = layout.shared.root_constant_info.as_ref().unwrap();

        self.pass.root_elements[info.root_index as usize] = super::RootElement::Constant;

-        self.pass.constant_data[(offset as usize)..(offset as usize + data.len())]
-            .copy_from_slice(data);
+        self.pass.constant_data[offset_words..(offset_words + data.len())].copy_from_slice(data);

        if self.pass.layout.signature == layout.shared.signature {
            self.pass.dirty_root_elements |= 1 << info.root_index;
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@ -327,7 +327,7 @@ impl crate::CommandEncoder<Api> for Encoder {
        &mut self,
        layout: &Resource,
        stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
    }
--- a/wgpu-hal/src/gles/command.rs
+++ b/wgpu-hal/src/gles/command.rs
@ -8,7 +8,6 @@ struct TextureSlotDesc {
    sampler_index: Option<u8>,
 }

-#[derive(Default)]
 pub(super) struct State {
    topology: u32,
    primitive: super::PrimitiveState,
@ -30,10 +29,41 @@ pub(super) struct State {
    instance_vbuf_mask: usize,
    dirty_vbuf_mask: usize,
    active_first_instance: u32,
-    push_offset_to_uniform: ArrayVec<super::UniformDesc, { super::MAX_PUSH_CONSTANTS }>,
+    push_constant_descs: ArrayVec<super::PushConstantDesc, { super::MAX_PUSH_CONSTANT_COMMANDS }>,
+    // The current state of the push constant data block.
+    current_push_constant_data: [u32; super::MAX_PUSH_CONSTANTS],
    end_of_pass_timestamp: Option<glow::Query>,
 }

+impl Default for State {
+    fn default() -> Self {
+        Self {
+            topology: Default::default(),
+            primitive: Default::default(),
+            index_format: Default::default(),
+            index_offset: Default::default(),
+            vertex_buffers: Default::default(),
+            vertex_attributes: Default::default(),
+            color_targets: Default::default(),
+            stencil: Default::default(),
+            depth_bias: Default::default(),
+            alpha_to_coverage_enabled: Default::default(),
+            samplers: Default::default(),
+            texture_slots: Default::default(),
+            render_size: Default::default(),
+            resolve_attachments: Default::default(),
+            invalidate_attachments: Default::default(),
+            has_pass_label: Default::default(),
+            instance_vbuf_mask: Default::default(),
+            dirty_vbuf_mask: Default::default(),
+            active_first_instance: Default::default(),
+            push_constant_descs: Default::default(),
+            current_push_constant_data: [0; super::MAX_PUSH_CONSTANTS],
+            end_of_pass_timestamp: Default::default(),
+        }
+    }
+}
+
 impl super::CommandBuffer {
    fn clear(&mut self) {
        self.label = None;
@ -176,10 +206,7 @@ impl super::CommandEncoder {
    fn set_pipeline_inner(&mut self, inner: &super::PipelineInner) {
        self.cmd_buffer.commands.push(C::SetProgram(inner.program));

-        self.state.push_offset_to_uniform.clear();
-        self.state
-            .push_offset_to_uniform
-            .extend(inner.uniforms.iter().cloned());
+        self.state.push_constant_descs = inner.push_constant_descs.clone();

        // rebind textures, if needed
        let mut dirty_textures = 0u32;
@ -729,24 +756,46 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
        &mut self,
        _layout: &super::PipelineLayout,
        _stages: wgt::ShaderStages,
-        start_offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
-        let range = self.cmd_buffer.add_push_constant_data(data);
+        // There is nothing preventing the user from trying to update a single value within
+        // a vector or matrix in the set_push_constant call, as to the user, all of this is
+        // just memory. However OpenGL does not allow parital uniform updates.
+        //
+        // As such, we locally keep a copy of the current state of the push constant memory
+        // block. If the user tries to update a single value, we have the data to update the entirety
+        // of the uniform.
+        let start_words = offset_bytes / 4;
+        let end_words = start_words + data.len() as u32;
+        self.state.current_push_constant_data[start_words as usize..end_words as usize]
+            .copy_from_slice(data);

-        let end = start_offset + data.len() as u32 * 4;
-        let mut offset = start_offset;
-        while offset < end {
-            let uniform = self.state.push_offset_to_uniform[offset as usize / 4].clone();
-            let size = uniform.size;
-            if uniform.location.is_none() {
-                panic!("No uniform for push constant");
+        // We iterate over the uniform list as there may be multiple uniforms that need
+        // updating from the same push constant memory (one for each shader stage).
+        //
+        // Additionally, any statically unused uniform descs will have been removed from this list
+        // by OpenGL, so the uniform list is not contiguous.
+        for uniform in self.state.push_constant_descs.iter().cloned() {
+            let uniform_size_words = uniform.size_bytes / 4;
+            let uniform_start_words = uniform.offset / 4;
+            let uniform_end_words = uniform_start_words + uniform_size_words;
+
+            // Is true if any word within the uniform binding was updated
+            let needs_updating =
+                start_words < uniform_end_words || uniform_start_words <= end_words;
+
+            if needs_updating {
+                let uniform_data = &self.state.current_push_constant_data
+                    [uniform_start_words as usize..uniform_end_words as usize];
+
+                let range = self.cmd_buffer.add_push_constant_data(uniform_data);
+
+                self.cmd_buffer.commands.push(C::SetPushConstants {
+                    uniform,
+                    offset: range.start,
+                });
            }
-            self.cmd_buffer.commands.push(C::SetPushConstants {
-                uniform,
-                offset: range.start + offset,
-            });
-            offset += size;
        }
    }

--- a/wgpu-hal/src/gles/conv.rs
+++ b/wgpu-hal/src/gles/conv.rs
@ -417,108 +417,6 @@ pub(super) fn map_storage_access(access: wgt::StorageTextureAccess) -> u32 {
    }
 }

-pub(super) fn is_sampler(glsl_uniform_type: u32) -> bool {
-    match glsl_uniform_type {
-        glow::INT_SAMPLER_1D
-        | glow::INT_SAMPLER_1D_ARRAY
-        | glow::INT_SAMPLER_2D
-        | glow::INT_SAMPLER_2D_ARRAY
-        | glow::INT_SAMPLER_2D_MULTISAMPLE
-        | glow::INT_SAMPLER_2D_MULTISAMPLE_ARRAY
-        | glow::INT_SAMPLER_2D_RECT
-        | glow::INT_SAMPLER_3D
-        | glow::INT_SAMPLER_CUBE
-        | glow::INT_SAMPLER_CUBE_MAP_ARRAY
-        | glow::UNSIGNED_INT_SAMPLER_1D
-        | glow::UNSIGNED_INT_SAMPLER_1D_ARRAY
-        | glow::UNSIGNED_INT_SAMPLER_2D
-        | glow::UNSIGNED_INT_SAMPLER_2D_ARRAY
-        | glow::UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE
-        | glow::UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE_ARRAY
-        | glow::UNSIGNED_INT_SAMPLER_2D_RECT
-        | glow::UNSIGNED_INT_SAMPLER_3D
-        | glow::UNSIGNED_INT_SAMPLER_CUBE
-        | glow::UNSIGNED_INT_SAMPLER_CUBE_MAP_ARRAY
-        | glow::SAMPLER_1D
-        | glow::SAMPLER_1D_SHADOW
-        | glow::SAMPLER_1D_ARRAY
-        | glow::SAMPLER_1D_ARRAY_SHADOW
-        | glow::SAMPLER_2D
-        | glow::SAMPLER_2D_SHADOW
-        | glow::SAMPLER_2D_ARRAY
-        | glow::SAMPLER_2D_ARRAY_SHADOW
-        | glow::SAMPLER_2D_MULTISAMPLE
-        | glow::SAMPLER_2D_MULTISAMPLE_ARRAY
-        | glow::SAMPLER_2D_RECT
-        | glow::SAMPLER_2D_RECT_SHADOW
-        | glow::SAMPLER_3D
-        | glow::SAMPLER_CUBE
-        | glow::SAMPLER_CUBE_MAP_ARRAY
-        | glow::SAMPLER_CUBE_MAP_ARRAY_SHADOW
-        | glow::SAMPLER_CUBE_SHADOW => true,
-        _ => false,
-    }
-}
-
-pub(super) fn is_image(glsl_uniform_type: u32) -> bool {
-    match glsl_uniform_type {
-        glow::INT_IMAGE_1D
-        | glow::INT_IMAGE_1D_ARRAY
-        | glow::INT_IMAGE_2D
-        | glow::INT_IMAGE_2D_ARRAY
-        | glow::INT_IMAGE_2D_MULTISAMPLE
-        | glow::INT_IMAGE_2D_MULTISAMPLE_ARRAY
-        | glow::INT_IMAGE_2D_RECT
-        | glow::INT_IMAGE_3D
-        | glow::INT_IMAGE_CUBE
-        | glow::INT_IMAGE_CUBE_MAP_ARRAY
-        | glow::UNSIGNED_INT_IMAGE_1D
-        | glow::UNSIGNED_INT_IMAGE_1D_ARRAY
-        | glow::UNSIGNED_INT_IMAGE_2D
-        | glow::UNSIGNED_INT_IMAGE_2D_ARRAY
-        | glow::UNSIGNED_INT_IMAGE_2D_MULTISAMPLE
-        | glow::UNSIGNED_INT_IMAGE_2D_MULTISAMPLE_ARRAY
-        | glow::UNSIGNED_INT_IMAGE_2D_RECT
-        | glow::UNSIGNED_INT_IMAGE_3D
-        | glow::UNSIGNED_INT_IMAGE_CUBE
-        | glow::UNSIGNED_INT_IMAGE_CUBE_MAP_ARRAY
-        | glow::IMAGE_1D
-        | glow::IMAGE_1D_ARRAY
-        | glow::IMAGE_2D
-        | glow::IMAGE_2D_ARRAY
-        | glow::IMAGE_2D_MULTISAMPLE
-        | glow::IMAGE_2D_MULTISAMPLE_ARRAY
-        | glow::IMAGE_2D_RECT
-        | glow::IMAGE_3D
-        | glow::IMAGE_CUBE
-        | glow::IMAGE_CUBE_MAP_ARRAY => true,
-        _ => false,
-    }
-}
-
-pub(super) fn is_atomic_counter(glsl_uniform_type: u32) -> bool {
-    glsl_uniform_type == glow::UNSIGNED_INT_ATOMIC_COUNTER
-}
-
-pub(super) fn is_opaque_type(glsl_uniform_type: u32) -> bool {
-    is_sampler(glsl_uniform_type)
-        || is_image(glsl_uniform_type)
-        || is_atomic_counter(glsl_uniform_type)
-}
-
-pub(super) fn uniform_byte_size(glsl_uniform_type: u32) -> u32 {
-    match glsl_uniform_type {
-        glow::FLOAT | glow::INT => 4,
-        glow::FLOAT_VEC2 | glow::INT_VEC2 => 8,
-        glow::FLOAT_VEC3 | glow::INT_VEC3 => 12,
-        glow::FLOAT_VEC4 | glow::INT_VEC4 => 16,
-        glow::FLOAT_MAT2 => 16,
-        glow::FLOAT_MAT3 => 36,
-        glow::FLOAT_MAT4 => 64,
-        _ => panic!("Unsupported uniform datatype! {glsl_uniform_type:#X}"),
-    }
-}
-
 pub(super) fn is_layered_target(target: u32) -> bool {
    match target {
        glow::TEXTURE_2D | glow::TEXTURE_CUBE_MAP => false,
--- a/wgpu-hal/src/gles/device.rs
+++ b/wgpu-hal/src/gles/device.rs
@ -23,6 +23,7 @@ struct CompilationContext<'a> {
    layout: &'a super::PipelineLayout,
    sampler_map: &'a mut super::SamplerBindMap,
    name_binding_map: &'a mut NameBindingMap,
+    push_constant_items: &'a mut Vec<naga::back::glsl::PushConstantItem>,
    multiview: Option<std::num::NonZeroU32>,
 }

@ -53,7 +54,7 @@ impl CompilationContext<'_> {
                Some(name) => name.clone(),
                None => continue,
            };
-            log::debug!(
+            log::trace!(
                "Rebind buffer: {:?} -> {}, register={:?}, slot={}",
                var.name.as_ref(),
                &name,
@ -101,6 +102,8 @@ impl CompilationContext<'_> {
                naga::ShaderStage::Compute => {}
            }
        }
+
+        *self.push_constant_items = reflection_info.push_constant_items;
    }
 }

@ -279,7 +282,7 @@ impl super::Device {
    unsafe fn create_pipeline<'a>(
        &self,
        gl: &glow::Context,
-        shaders: ArrayVec<ShaderStage<'a>, 3>,
+        shaders: ArrayVec<ShaderStage<'a>, { crate::MAX_CONCURRENT_SHADER_STAGES }>,
        layout: &super::PipelineLayout,
        #[cfg_attr(target_arch = "wasm32", allow(unused))] label: Option<&str>,
        multiview: Option<std::num::NonZeroU32>,
@ -327,7 +330,7 @@ impl super::Device {

    unsafe fn create_program<'a>(
        gl: &glow::Context,
-        shaders: ArrayVec<ShaderStage<'a>, 3>,
+        shaders: ArrayVec<ShaderStage<'a>, { crate::MAX_CONCURRENT_SHADER_STAGES }>,
        layout: &super::PipelineLayout,
        #[cfg_attr(target_arch = "wasm32", allow(unused))] label: Option<&str>,
        multiview: Option<std::num::NonZeroU32>,
@ -348,16 +351,22 @@ impl super::Device {
        }

        let mut name_binding_map = NameBindingMap::default();
+        let mut push_constant_items = ArrayVec::<_, { crate::MAX_CONCURRENT_SHADER_STAGES }>::new();
        let mut sampler_map = [None; super::MAX_TEXTURE_SLOTS];
        let mut has_stages = wgt::ShaderStages::empty();
-        let mut shaders_to_delete = arrayvec::ArrayVec::<_, 3>::new();
+        let mut shaders_to_delete = ArrayVec::<_, { crate::MAX_CONCURRENT_SHADER_STAGES }>::new();

-        for (naga_stage, stage) in shaders {
+        for &(naga_stage, stage) in &shaders {
            has_stages |= map_naga_stage(naga_stage);
+            let pc_item = {
+                push_constant_items.push(Vec::new());
+                push_constant_items.last_mut().unwrap()
+            };
            let context = CompilationContext {
                layout,
                sampler_map: &mut sampler_map,
                name_binding_map: &mut name_binding_map,
+                push_constant_items: pc_item,
                multiview,
            };

@ -409,6 +418,7 @@ impl super::Device {
                match register {
                    super::BindingRegister::UniformBuffers => {
                        let index = unsafe { gl.get_uniform_block_index(program, name) }.unwrap();
+                        log::trace!("\tBinding slot {slot} to block index {index}");
                        unsafe { gl.uniform_block_binding(program, index, slot as _) };
                    }
                    super::BindingRegister::StorageBuffers => {
@ -429,41 +439,38 @@ impl super::Device {
            }
        }

-        let mut uniforms: [super::UniformDesc; super::MAX_PUSH_CONSTANTS] =
-            [None; super::MAX_PUSH_CONSTANTS].map(|_: Option<()>| Default::default());
-        let count = unsafe { gl.get_active_uniforms(program) };
-        let mut offset = 0;
+        let mut uniforms = ArrayVec::new();

-        for uniform in 0..count {
-            let glow::ActiveUniform { utype, name, .. } =
-                unsafe { gl.get_active_uniform(program, uniform) }.unwrap();
+        for (stage_idx, stage_items) in push_constant_items.into_iter().enumerate() {
+            for item in stage_items {
+                let naga_module = &shaders[stage_idx].1.module.naga.module;
+                let type_inner = &naga_module.types[item.ty].inner;

-            if conv::is_opaque_type(utype) {
-                continue;
-            }
+                let location = unsafe { gl.get_uniform_location(program, &item.access_path) };

-            if let Some(location) = unsafe { gl.get_uniform_location(program, &name) } {
-                if uniforms[offset / 4].location.is_some() {
-                    panic!("Offset already occupied")
+                log::trace!(
+                    "push constant item: name={}, ty={:?}, offset={}, location={:?}",
+                    item.access_path,
+                    type_inner,
+                    item.offset,
+                    location,
+                );
+
+                if let Some(location) = location {
+                    uniforms.push(super::PushConstantDesc {
+                        location,
+                        offset: item.offset,
+                        size_bytes: type_inner.size(naga_module.to_ctx()),
+                        ty: type_inner.clone(),
+                    });
                }
-
-                // `size` will always be 1 so we need to guess the real size from the type
-                let uniform_size = conv::uniform_byte_size(utype);
-
-                uniforms[offset / 4] = super::UniformDesc {
-                    location: Some(location),
-                    size: uniform_size,
-                    utype,
-                };
-
-                offset += uniform_size as usize;
            }
        }

        Ok(Arc::new(super::PipelineInner {
            program,
            sampler_map,
-            uniforms,
+            push_constant_descs: uniforms,
        }))
    }
 }
--- a/wgpu-hal/src/gles/mod.rs
+++ b/wgpu-hal/src/gles/mod.rs
@ -108,6 +108,8 @@ const MAX_SAMPLERS: usize = 16;
 const MAX_VERTEX_ATTRIBUTES: usize = 16;
 const ZERO_BUFFER_SIZE: usize = 256 << 10;
 const MAX_PUSH_CONSTANTS: usize = 64;
+// We have to account for each push constant may need to be set for every shader.
+const MAX_PUSH_CONSTANT_COMMANDS: usize = MAX_PUSH_CONSTANTS * crate::MAX_CONCURRENT_SHADER_STAGES;

 impl crate::Api for Api {
    type Instance = Instance;
@ -483,11 +485,12 @@ struct VertexBufferDesc {
    stride: u32,
 }

-#[derive(Clone, Debug, Default)]
-struct UniformDesc {
-    location: Option<glow::UniformLocation>,
-    size: u32,
-    utype: u32,
+#[derive(Clone, Debug)]
+struct PushConstantDesc {
+    location: glow::UniformLocation,
+    ty: naga::TypeInner,
+    offset: u32,
+    size_bytes: u32,
 }

 #[cfg(all(
@ -495,13 +498,13 @@ struct UniformDesc {
    feature = "fragile-send-sync-non-atomic-wasm",
    not(target_feature = "atomics")
 ))]
-unsafe impl Sync for UniformDesc {}
+unsafe impl Sync for PushConstantDesc {}
 #[cfg(all(
    target_arch = "wasm32",
    feature = "fragile-send-sync-non-atomic-wasm",
    not(target_feature = "atomics")
 ))]
-unsafe impl Send for UniformDesc {}
+unsafe impl Send for PushConstantDesc {}

 /// For each texture in the pipeline layout, store the index of the only
 /// sampler (in this layout) that the texture is used with.
@ -510,7 +513,7 @@ type SamplerBindMap = [Option<u8>; MAX_TEXTURE_SLOTS];
 struct PipelineInner {
    program: glow::Program,
    sampler_map: SamplerBindMap,
-    uniforms: [UniformDesc; MAX_PUSH_CONSTANTS],
+    push_constant_descs: ArrayVec<PushConstantDesc, MAX_PUSH_CONSTANT_COMMANDS>,
 }

 #[derive(Clone, Debug)]
@ -882,7 +885,7 @@ enum Command {
    PushDebugGroup(Range<u32>),
    PopDebugGroup,
    SetPushConstants {
-        uniform: UniformDesc,
+        uniform: PushConstantDesc,
        /// Offset from the start of the `data_bytes`
        offset: u32,
    },
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@ -1441,64 +1441,235 @@ impl super::Queue {
                ref uniform,
                offset,
            } => {
-                fn get_data<T>(data: &[u8], offset: u32) -> &[T] {
-                    let raw = &data[(offset as usize)..];
-                    unsafe {
-                        slice::from_raw_parts(
-                            raw.as_ptr() as *const _,
-                            raw.len() / mem::size_of::<T>(),
-                        )
-                    }
+                // T must be POD
+                //
+                // This function is absolutely sketchy and we really should be using bytemuck.
+                unsafe fn get_data<T, const COUNT: usize>(data: &[u8], offset: u32) -> &[T; COUNT] {
+                    let data_required = mem::size_of::<T>() * COUNT;
+
+                    let raw = &data[(offset as usize)..][..data_required];
+
+                    debug_assert_eq!(data_required, raw.len());
+
+                    let slice: &[T] =
+                        unsafe { slice::from_raw_parts(raw.as_ptr() as *const _, COUNT) };
+
+                    slice.try_into().unwrap()
                }

-                let location = uniform.location.as_ref();
+                let location = Some(&uniform.location);

-                match uniform.utype {
-                    glow::FLOAT => {
-                        let data = get_data::<f32>(data_bytes, offset)[0];
+                match uniform.ty {
+                    //
+                    // --- Float 1-4 Component ---
+                    //
+                    naga::TypeInner::Scalar {
+                        kind: naga::ScalarKind::Float,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 1>(data_bytes, offset)[0] };
                        unsafe { gl.uniform_1_f32(location, data) };
                    }
-                    glow::FLOAT_VEC2 => {
-                        let data = get_data::<[f32; 2]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_2_f32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Float,
+                        size: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 2>(data_bytes, offset) };
+                        unsafe { gl.uniform_2_f32_slice(location, data) };
                    }
-                    glow::FLOAT_VEC3 => {
-                        let data = get_data::<[f32; 3]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_3_f32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Float,
+                        size: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 3>(data_bytes, offset) };
+                        unsafe { gl.uniform_3_f32_slice(location, data) };
                    }
-                    glow::FLOAT_VEC4 => {
-                        let data = get_data::<[f32; 4]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_4_f32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Float,
+                        size: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 4>(data_bytes, offset) };
+                        unsafe { gl.uniform_4_f32_slice(location, data) };
                    }
-                    glow::INT => {
-                        let data = get_data::<i32>(data_bytes, offset)[0];
+
+                    //
+                    // --- Int 1-4 Component ---
+                    //
+                    naga::TypeInner::Scalar {
+                        kind: naga::ScalarKind::Sint,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<i32, 1>(data_bytes, offset)[0] };
                        unsafe { gl.uniform_1_i32(location, data) };
                    }
-                    glow::INT_VEC2 => {
-                        let data = get_data::<[i32; 2]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_2_i32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Sint,
+                        size: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<i32, 2>(data_bytes, offset) };
+                        unsafe { gl.uniform_2_i32_slice(location, data) };
                    }
-                    glow::INT_VEC3 => {
-                        let data = get_data::<[i32; 3]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_3_i32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Sint,
+                        size: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<i32, 3>(data_bytes, offset) };
+                        unsafe { gl.uniform_3_i32_slice(location, data) };
                    }
-                    glow::INT_VEC4 => {
-                        let data = get_data::<[i32; 4]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_4_i32_slice(location, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Sint,
+                        size: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<i32, 4>(data_bytes, offset) };
+                        unsafe { gl.uniform_4_i32_slice(location, data) };
                    }
-                    glow::FLOAT_MAT2 => {
-                        let data = get_data::<[f32; 4]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_matrix_2_f32_slice(location, false, &data) };
+
+                    //
+                    // --- Uint 1-4 Component ---
+                    //
+                    naga::TypeInner::Scalar {
+                        kind: naga::ScalarKind::Uint,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<u32, 1>(data_bytes, offset)[0] };
+                        unsafe { gl.uniform_1_u32(location, data) };
                    }
-                    glow::FLOAT_MAT3 => {
-                        let data = get_data::<[f32; 9]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_matrix_3_f32_slice(location, false, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Uint,
+                        size: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<u32, 2>(data_bytes, offset) };
+                        unsafe { gl.uniform_2_u32_slice(location, data) };
                    }
-                    glow::FLOAT_MAT4 => {
-                        let data = get_data::<[f32; 16]>(data_bytes, offset)[0];
-                        unsafe { gl.uniform_matrix_4_f32_slice(location, false, &data) };
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Uint,
+                        size: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<u32, 3>(data_bytes, offset) };
+                        unsafe { gl.uniform_3_u32_slice(location, data) };
                    }
-                    _ => panic!("Unsupported uniform datatype!"),
+                    naga::TypeInner::Vector {
+                        kind: naga::ScalarKind::Uint,
+                        size: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<u32, 4>(data_bytes, offset) };
+                        unsafe { gl.uniform_4_u32_slice(location, data) };
+                    }
+
+                    //
+                    // --- Matrix 2xR ---
+                    //
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Bi,
+                        rows: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 4>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_2_f32_slice(location, false, data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Bi,
+                        rows: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        // repack 2 vec3s into 6 values.
+                        let unpacked_data = unsafe { get_data::<f32, 8>(data_bytes, offset) };
+                        #[rustfmt::skip]
+                        let packed_data = [
+                            unpacked_data[0], unpacked_data[1], unpacked_data[2],
+                            unpacked_data[4], unpacked_data[5], unpacked_data[6],
+                        ];
+                        unsafe { gl.uniform_matrix_2x3_f32_slice(location, false, &packed_data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Bi,
+                        rows: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 8>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_2x4_f32_slice(location, false, data) };
+                    }
+
+                    //
+                    // --- Matrix 3xR ---
+                    //
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Tri,
+                        rows: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 6>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_3x2_f32_slice(location, false, data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Tri,
+                        rows: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        // repack 3 vec3s into 9 values.
+                        let unpacked_data = unsafe { get_data::<f32, 12>(data_bytes, offset) };
+                        #[rustfmt::skip]
+                        let packed_data = [
+                            unpacked_data[0], unpacked_data[1], unpacked_data[2],
+                            unpacked_data[4], unpacked_data[5], unpacked_data[6],
+                            unpacked_data[8], unpacked_data[9], unpacked_data[10],
+                        ];
+                        unsafe { gl.uniform_matrix_3_f32_slice(location, false, &packed_data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Tri,
+                        rows: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 12>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_3x4_f32_slice(location, false, data) };
+                    }
+
+                    //
+                    // --- Matrix 4xR ---
+                    //
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Quad,
+                        rows: naga::VectorSize::Bi,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 8>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_4x2_f32_slice(location, false, data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Quad,
+                        rows: naga::VectorSize::Tri,
+                        width: 4,
+                    } => {
+                        // repack 4 vec3s into 12 values.
+                        let unpacked_data = unsafe { get_data::<f32, 16>(data_bytes, offset) };
+                        #[rustfmt::skip]
+                        let packed_data = [
+                            unpacked_data[0], unpacked_data[1], unpacked_data[2],
+                            unpacked_data[4], unpacked_data[5], unpacked_data[6],
+                            unpacked_data[8], unpacked_data[9], unpacked_data[10],
+                            unpacked_data[12], unpacked_data[13], unpacked_data[14],
+                        ];
+                        unsafe { gl.uniform_matrix_4x3_f32_slice(location, false, &packed_data) };
+                    }
+                    naga::TypeInner::Matrix {
+                        columns: naga::VectorSize::Quad,
+                        rows: naga::VectorSize::Quad,
+                        width: 4,
+                    } => {
+                        let data = unsafe { get_data::<f32, 16>(data_bytes, offset) };
+                        unsafe { gl.uniform_matrix_4_f32_slice(location, false, data) };
+                    }
+                    _ => panic!("Unsupported uniform datatype: {:?}!", uniform.ty),
                }
            }
        }
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@ -97,6 +97,9 @@ use bitflags::bitflags;
 use thiserror::Error;
 use wgt::{WasmNotSend, WasmNotSync};

+// - Vertex + Fragment
+// - Compute
+pub const MAX_CONCURRENT_SHADER_STAGES: usize = 2;
 pub const MAX_ANISOTROPY: u8 = 16;
 pub const MAX_BIND_GROUPS: usize = 8;
 pub const MAX_VERTEX_BUFFERS: usize = 16;
@ -500,11 +503,19 @@ pub trait CommandEncoder<A: Api>: WasmNotSend + WasmNotSync + fmt::Debug {
        dynamic_offsets: &[wgt::DynamicOffset],
    );

+    /// Sets a range in push constant data.
+    ///
+    /// IMPORTANT: while the data is passed as words, the offset is in bytes!
+    ///
+    /// # Safety
+    ///
+    /// - `offset_bytes` must be a multiple of 4.
+    /// - The range of push constants written must be valid for the pipeline layout at draw time.
    unsafe fn set_push_constants(
        &mut self,
        layout: &A::PipelineLayout,
        stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    );

--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@ -798,17 +798,17 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
        &mut self,
        layout: &super::PipelineLayout,
        stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
        let state_pc = &mut self.state.push_constants;
        if state_pc.len() < layout.total_push_constants as usize {
            state_pc.resize(layout.total_push_constants as usize, 0);
        }
-        assert_eq!(offset as usize % WORD_SIZE, 0);
+        debug_assert_eq!(offset_bytes as usize % WORD_SIZE, 0);

-        let offset = offset as usize / WORD_SIZE;
-        state_pc[offset..offset + data.len()].copy_from_slice(data);
+        let offset_words = offset_bytes as usize / WORD_SIZE;
+        state_pc[offset_words..offset_words + data.len()].copy_from_slice(data);

        if stages.contains(wgt::ShaderStages::COMPUTE) {
            self.state.compute.as_ref().unwrap().set_bytes(
--- a/wgpu-hal/src/vulkan/command.rs
+++ b/wgpu-hal/src/vulkan/command.rs
@ -600,7 +600,7 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
        &mut self,
        layout: &super::PipelineLayout,
        stages: wgt::ShaderStages,
-        offset: u32,
+        offset_bytes: u32,
        data: &[u32],
    ) {
        unsafe {
@ -608,7 +608,7 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                self.active,
                layout.raw,
                conv::map_shader_stage(stages),
-                offset,
+                offset_bytes,
                slice::from_raw_parts(data.as_ptr() as _, data.len() * 4),
            )
        };
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@ -1588,7 +1588,7 @@ impl crate::Device<super::Api> for super::Device {
            multiview: desc.multiview,
            ..Default::default()
        };
-        let mut stages = ArrayVec::<_, 2>::new();
+        let mut stages = ArrayVec::<_, { crate::MAX_CONCURRENT_SHADER_STAGES }>::new();
        let mut vertex_buffers = Vec::with_capacity(desc.vertex_buffers.len());
        let mut vertex_attributes = Vec::new();

--- a/wgpu/src/backend/direct.rs
+++ b/wgpu/src/backend/direct.rs
@ -1217,7 +1217,7 @@ impl crate::Context for Context {
        if let Some(cause) = error {
            if let wgc::pipeline::CreateRenderPipelineError::Internal { stage, ref error } = cause {
                log::error!("Shader translation error for stage {:?}: {}", stage, error);
-                log::error!("Please report it to https://github.com/gfx-rs/naga");
+                log::error!("Please report it to https://github.com/gfx-rs/wgpu");
            }
            self.handle_error(
                &device_data.error_sink,
@ -1262,12 +1262,12 @@ impl crate::Context for Context {
        ));
        if let Some(cause) = error {
            if let wgc::pipeline::CreateComputePipelineError::Internal(ref error) = cause {
-                log::warn!(
+                log::error!(
                    "Shader translation error for stage {:?}: {}",
                    wgt::ShaderStages::COMPUTE,
                    error
                );
-                log::warn!("Please report it to https://github.com/gfx-rs/naga");
+                log::error!("Please report it to https://github.com/gfx-rs/wgpu");
            }
            self.handle_error(
                &device_data.error_sink,