Remove vertex_pulling_transform from PipelineCompilationOptions.

This option was only honored by the Metal backend, and the transform is
now always applied there, so the option is going away. It remains
configurable for tests via the PipelineOptions struct, deserialized from
.ron files.

This also fixes some type problems with the unpack functions in
writer.rs. Metal's << operator promotes its operands to int, so the
result has to be cast back down to the real size before the as_type bit
conversion. The math for the snorm values is also corrected, in some
cases by using Metal's unpack_snorm2x16_to_float function, because a
bit-shifted ushort value cannot be bit-cast directly to half.
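
To illustrate the promotion issue, here is a minimal, hypothetical sketch in the
style of the writer.rs emitters. The function name and parameter types are
assumptions; only the metal::ushort(...) narrowing pattern is taken from this
commit.

use std::fmt::Write;

// Hypothetical emitter sketch. MSL's << promotes ushort operands to int, and
// as_type<half> requires a 16-bit source, so the shifted value is narrowed
// back to metal::ushort before the bit cast, mirroring the fix in this commit.
fn emit_unpack_float16x2(out: &mut String, name: &str) -> std::fmt::Result {
    writeln!(
        out,
        "metal::float2 {name}(metal::ushort b0, metal::ushort b1, \
         metal::ushort b2, metal::ushort b3) {{"
    )?;
    writeln!(
        out,
        "    return metal::float2(as_type<half>(metal::ushort(b1 << 8 | b0)), \
         as_type<half>(metal::ushort(b3 << 8 | b2)));"
    )?;
    writeln!(out, "}}")
}

fn main() {
    let mut out = String::new();
    emit_unpack_float16x2(&mut out, "unpackFloat16x2").unwrap();
    print!("{out}");
}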
Brad Werth 2024-06-04 10:11:03 -07:00 committed by Teodor Tanasoaia
parent bc7622f641
commit 6cd387412f
17 changed files with 756 additions and 105 deletions

View File

@ -112,7 +112,6 @@ pub fn op_webgpu_create_compute_pipeline(
entry_point: compute.entry_point.map(Cow::from),
constants: Cow::Owned(compute.constants.unwrap_or_default()),
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
},
cache: None,
};
@ -348,7 +347,6 @@ pub fn op_webgpu_create_render_pipeline(
constants: Cow::Owned(fragment.constants.unwrap_or_default()),
// Required to be true for WebGPU
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
},
targets: Cow::Owned(fragment.targets),
})
@ -374,7 +372,6 @@ pub fn op_webgpu_create_render_pipeline(
constants: Cow::Owned(args.vertex.constants.unwrap_or_default()),
// Required to be true for WebGPU
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
},
buffers: Cow::Owned(vertex_buffers),
},

View File

@ -81,6 +81,7 @@ For changelogs after v0.14, see [the wgpu changelog](../CHANGELOG.md).
- Make varyings' struct members unique. ([#2521](https://github.com/gfx-rs/naga/pull/2521)) **@evahop**
- Add experimental vertex pulling transform flag. ([#5254](https://github.com/gfx-rs/wgpu/pull/5254)) **@bradwerth**
- Fixup some generated MSL for vertex buffer unpack functions. ([#5829](https://github.com/gfx-rs/wgpu/pull/5829)) **@bradwerth**
- Make vertex pulling transform on by default. ([#5773](https://github.com/gfx-rs/wgpu/pull/5773)) **@bradwerth**
#### GLSL-OUT

View File

@ -354,7 +354,9 @@ pub struct PipelineOptions {
/// to receive the vertex buffers, lengths, and vertex id as args,
/// and bounds-check the vertex id and use the index into the
/// vertex buffers to access attributes, rather than using Metal's
/// [[stage-in]] assembled attribute data.
/// [[stage-in]] assembled attribute data. This is true by default,
/// but remains configurable for use by tests via deserialization
/// of this struct. There is no user-facing way to set this value.
pub vertex_pulling_transform: bool,
/// vertex_buffer_mappings are used during shader translation to
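
As the doc comment above notes, there is no user-facing way to set this flag;
tests reach it through this struct, normally by deserializing a .ron parameter
file. A hedged Rust sketch of setting it directly follows; only the
vertex_pulling_transform field is confirmed by this diff, and the msl-out
feature plus a Default impl for PipelineOptions are assumptions.

// Hedged sketch: building naga MSL pipeline options with the transform enabled,
// as a test might after deserializing its .ron options. Assumes naga is built
// with the msl-out feature and that PipelineOptions implements Default.
use naga::back::msl;

fn msl_test_options() -> msl::PipelineOptions {
    msl::PipelineOptions {
        vertex_pulling_transform: true, // the field documented above
        ..Default::default()
    }
}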

View File

@ -3953,8 +3953,8 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::float2((float(b0) - 128.0f) / 255.0f, \
(float(b1) - 128.0f) / 255.0f);",
"{}return metal::float2(metal::max(-1.0f, as_type<char>(b0) / 127.0f), \
metal::max(-1.0f, as_type<char>(b1) / 127.0f));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -3971,10 +3971,10 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::float4((float(b0) - 128.0f) / 255.0f, \
(float(b1) - 128.0f) / 255.0f, \
(float(b2) - 128.0f) / 255.0f, \
(float(b3) - 128.0f) / 255.0f);",
"{}return metal::float4(metal::max(-1.0f, as_type<char>(b0) / 127.0f), \
metal::max(-1.0f, as_type<char>(b1) / 127.0f), \
metal::max(-1.0f, as_type<char>(b2) / 127.0f), \
metal::max(-1.0f, as_type<char>(b3) / 127.0f));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4033,8 +4033,8 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::int2(as_type<metal::short>(b1 << 8 | b0), \
as_type<metal::short>(b3 << 8 | b2));",
"{}return metal::int2(as_type<short>(metal::ushort(b1 << 8 | b0)), \
as_type<short>(metal::ushort(b3 << 8 | b2)));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4055,10 +4055,10 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::int4(as_type<metal::short>(b1 << 8 | b0), \
as_type<metal::short>(b3 << 8 | b2), \
as_type<metal::short>(b5 << 8 | b4), \
as_type<metal::short>(b7 << 8 | b6));",
"{}return metal::int4(as_type<short>(metal::ushort(b1 << 8 | b0)), \
as_type<short>(metal::ushort(b3 << 8 | b2)), \
as_type<short>(metal::ushort(b5 << 8 | b4)), \
as_type<short>(metal::ushort(b7 << 8 | b6)));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4117,8 +4117,7 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::float2((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \
(float(b3 << 8 | b2) - 32767.0f) / 65535.0f);",
"{}return metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2);",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4139,10 +4138,8 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::float4((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \
(float(b3 << 8 | b2) - 32767.0f) / 65535.0f, \
(float(b5 << 8 | b4) - 32767.0f) / 65535.0f, \
(float(b7 << 8 | b6) - 32767.0f) / 65535.0f);",
"{}return metal::float4(metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2), \
metal::unpack_snorm2x16_to_float(b5 << 24 | b4 << 16 | b7 << 8 | b6));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4159,8 +4156,8 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::float2(as_type<metal::half>(b1 << 8 | b0), \
as_type<metal::half>(b3 << 8 | b2));",
"{}return metal::float2(as_type<half>(metal::ushort(b1 << 8 | b0)), \
as_type<half>(metal::ushort(b3 << 8 | b2)));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4170,7 +4167,7 @@ impl<W: Write> Writer<W> {
let name = self.namer.call("unpackFloat16x4");
writeln!(
self.out,
"metal::int4 {name}(metal::ushort b0, \
"metal::float4 {name}(metal::ushort b0, \
metal::ushort b1, \
metal::ushort b2, \
metal::ushort b3, \
@ -4181,10 +4178,10 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return metal::int4(as_type<metal::half>(b1 << 8 | b0), \
as_type<metal::half>(b3 << 8 | b2), \
as_type<metal::half>(b5 << 8 | b4), \
as_type<metal::half>(b7 << 8 | b6));",
"{}return metal::float4(as_type<half>(metal::ushort(b1 << 8 | b0)), \
as_type<half>(metal::ushort(b3 << 8 | b2)), \
as_type<half>(metal::ushort(b5 << 8 | b4)), \
as_type<half>(metal::ushort(b7 << 8 | b6)));",
back::INDENT
)?;
writeln!(self.out, "}}")?;
@ -4390,10 +4387,10 @@ impl<W: Write> Writer<W> {
let name = self.namer.call("unpackSint32");
writeln!(
self.out,
"metal::int {name}(uint b0, \
uint b1, \
uint b2, \
uint b3) {{"
"int {name}(uint b0, \
uint b1, \
uint b2, \
uint b3) {{"
)?;
writeln!(
self.out,
@ -4495,7 +4492,18 @@ impl<W: Write> Writer<W> {
)?;
writeln!(
self.out,
"{}return unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);",
// The following is correct for RGBA packing, but our format seems to
// match ABGR, which can be fed into the Metal builtin function
// unpack_unorm10a2_to_float.
/*
"{}uint v = (b3 << 24 | b2 << 16 | b1 << 8 | b0); \
uint r = (v & 0xFFC00000) >> 22; \
uint g = (v & 0x003FF000) >> 12; \
uint b = (v & 0x00000FFC) >> 2; \
uint a = (v & 0x00000003); \
return metal::float4(float(r) / 1023.0f, float(g) / 1023.0f, float(b) / 1023.0f, float(a) / 3.0f);",
*/
"{}return metal::unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);",
back::INDENT
)?;
writeln!(self.out, "}}")?;

View File

@ -42,6 +42,7 @@ mod subgroup_operations;
mod texture_bounds;
mod texture_view_creation;
mod transfer;
mod vertex_formats;
mod vertex_indices;
mod write_texture;
mod zero_init_texture_after_discard;

View File

@ -0,0 +1,316 @@
@group(0) @binding(0)
var<storage, read_write> checksums: array<f32>;
const index_uint = 0u;
const index_sint = 1u;
const index_unorm = 2u;
const index_snorm = 3u;
const index_float16 = 4u;
const index_float32 = 5u;
fn init_checksums() {
checksums[index_uint] = 0.0;
checksums[index_sint] = 0.0;
checksums[index_unorm] = 0.0;
checksums[index_snorm] = 0.0;
checksums[index_float16] = 0.0;
checksums[index_float32] = 0.0;
}
// Break down the 31 vertex formats specified at
// https://gpuweb.github.io/gpuweb/#vertex-formats into blocks
// of 8, to keep under the limits of max locations. Each
// AttributeBlockX structure will get a corresponding
// vertex_block_X function to process its attributes into
// values written to the checksums buffer.
struct AttributeBlock0 {
// 4-byte-aligned unorm formats
@location(0) unorm8x4: vec4<f32>,
@location(1) unorm16x2: vec2<f32>,
@location(2) unorm16x4: vec4<f32>,
// 4-byte-aligned snorm formats
@location(3) snorm8x4: vec4<f32>,
@location(4) snorm16x2: vec2<f32>,
@location(5) snorm16x4: vec4<f32>,
// 2-byte-aligned formats
@location(6) unorm8x2: vec2<f32>,
@location(7) snorm8x2: vec2<f32>,
}
@vertex
fn vertex_block_0(v_in: AttributeBlock0) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all unorm into one checksum value.
var all_unorm: f32 = 0.0;
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.x);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.y);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.x);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.y);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.z);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.w);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.x);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.y);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.x);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.y);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.z);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.w);
checksums[index_unorm] = f32(all_unorm);
// Accumulate all snorm into one checksum value.
var all_snorm: f32 = 0.0;
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.x);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.y);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.x);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.y);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.z);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.w);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.x);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.y);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.x);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.y);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.z);
all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.w);
checksums[index_snorm] = f32(all_snorm);
return vec4(0.0);
}
struct AttributeBlock1 {
// 4-byte-aligned uint formats
@location(0) uint8x4: vec4<u32>,
@location(1) uint16x2: vec2<u32>,
@location(2) uint16x4: vec4<u32>,
// 4-byte-aligned sint formats
@location(3) sint8x4: vec4<i32>,
@location(4) sint16x2: vec2<i32>,
@location(5) sint16x4: vec4<i32>,
// 2-byte-aligned formats
@location(6) uint8x2: vec2<u32>,
@location(7) sint8x2: vec2<i32>,
}
@vertex
fn vertex_block_1(v_in: AttributeBlock1) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all uint into one checksum value.
var all_uint: u32 = 0;
all_uint = accumulate_uint(all_uint, v_in.uint8x2.x);
all_uint = accumulate_uint(all_uint, v_in.uint8x2.y);
all_uint = accumulate_uint(all_uint, v_in.uint8x4.x);
all_uint = accumulate_uint(all_uint, v_in.uint8x4.y);
all_uint = accumulate_uint(all_uint, v_in.uint8x4.z);
all_uint = accumulate_uint(all_uint, v_in.uint8x4.w);
all_uint = accumulate_uint(all_uint, v_in.uint16x2.x);
all_uint = accumulate_uint(all_uint, v_in.uint16x2.y);
all_uint = accumulate_uint(all_uint, v_in.uint16x4.x);
all_uint = accumulate_uint(all_uint, v_in.uint16x4.y);
all_uint = accumulate_uint(all_uint, v_in.uint16x4.z);
all_uint = accumulate_uint(all_uint, v_in.uint16x4.w);
checksums[index_uint] = f32(all_uint);
// Accumulate all sint into one checksum value.
var all_sint: i32 = 0;
all_sint = accumulate_sint(all_sint, v_in.sint8x2.x);
all_sint = accumulate_sint(all_sint, v_in.sint8x2.y);
all_sint = accumulate_sint(all_sint, v_in.sint8x4.x);
all_sint = accumulate_sint(all_sint, v_in.sint8x4.y);
all_sint = accumulate_sint(all_sint, v_in.sint8x4.z);
all_sint = accumulate_sint(all_sint, v_in.sint8x4.w);
all_sint = accumulate_sint(all_sint, v_in.sint16x2.x);
all_sint = accumulate_sint(all_sint, v_in.sint16x2.y);
all_sint = accumulate_sint(all_sint, v_in.sint16x4.x);
all_sint = accumulate_sint(all_sint, v_in.sint16x4.y);
all_sint = accumulate_sint(all_sint, v_in.sint16x4.z);
all_sint = accumulate_sint(all_sint, v_in.sint16x4.w);
checksums[index_sint] = f32(all_sint);
return vec4(0.0);
}
struct AttributeBlock2 {
@location(0) uint32: u32,
@location(1) uint32x2: vec2<u32>,
@location(2) uint32x3: vec3<u32>,
@location(3) uint32x4: vec4<u32>,
}
@vertex
fn vertex_block_2(v_in: AttributeBlock2) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all uint into one checksum value.
var all_uint: u32 = 0;
all_uint = accumulate_uint(all_uint, v_in.uint32);
all_uint = accumulate_uint(all_uint, v_in.uint32x2.x);
all_uint = accumulate_uint(all_uint, v_in.uint32x2.y);
all_uint = accumulate_uint(all_uint, v_in.uint32x3.x);
all_uint = accumulate_uint(all_uint, v_in.uint32x3.y);
all_uint = accumulate_uint(all_uint, v_in.uint32x3.z);
all_uint = accumulate_uint(all_uint, v_in.uint32x4.x);
all_uint = accumulate_uint(all_uint, v_in.uint32x4.y);
all_uint = accumulate_uint(all_uint, v_in.uint32x4.z);
all_uint = accumulate_uint(all_uint, v_in.uint32x4.w);
checksums[index_uint] = f32(all_uint);
return vec4(0.0);
}
struct AttributeBlock3 {
@location(0) sint32: i32,
@location(1) sint32x2: vec2<i32>,
@location(2) sint32x3: vec3<i32>,
@location(3) sint32x4: vec4<i32>,
}
@vertex
fn vertex_block_3(v_in: AttributeBlock3) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all sint into one checksum value.
var all_sint: i32 = 0;
all_sint = accumulate_sint(all_sint, v_in.sint32);
all_sint = accumulate_sint(all_sint, v_in.sint32x2.x);
all_sint = accumulate_sint(all_sint, v_in.sint32x2.y);
all_sint = accumulate_sint(all_sint, v_in.sint32x3.x);
all_sint = accumulate_sint(all_sint, v_in.sint32x3.y);
all_sint = accumulate_sint(all_sint, v_in.sint32x3.z);
all_sint = accumulate_sint(all_sint, v_in.sint32x4.x);
all_sint = accumulate_sint(all_sint, v_in.sint32x4.y);
all_sint = accumulate_sint(all_sint, v_in.sint32x4.z);
all_sint = accumulate_sint(all_sint, v_in.sint32x4.w);
checksums[index_sint] = f32(all_sint);
return vec4(0.0);
}
struct AttributeBlock4{
@location(0) float32: f32,
@location(1) float32x2: vec2<f32>,
@location(2) float32x3: vec3<f32>,
@location(3) float32x4: vec4<f32>,
@location(4) float16x2: vec2<f32>,
@location(5) float16x4: vec4<f32>,
}
@vertex
fn vertex_block_4(v_in: AttributeBlock4) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all float32 into one checksum value.
var all_float32: f32 = 0.0;
all_float32 = accumulate_float32(all_float32, v_in.float32);
all_float32 = accumulate_float32(all_float32, v_in.float32x2.x);
all_float32 = accumulate_float32(all_float32, v_in.float32x2.y);
all_float32 = accumulate_float32(all_float32, v_in.float32x3.x);
all_float32 = accumulate_float32(all_float32, v_in.float32x3.y);
all_float32 = accumulate_float32(all_float32, v_in.float32x3.z);
all_float32 = accumulate_float32(all_float32, v_in.float32x4.x);
all_float32 = accumulate_float32(all_float32, v_in.float32x4.y);
all_float32 = accumulate_float32(all_float32, v_in.float32x4.z);
all_float32 = accumulate_float32(all_float32, v_in.float32x4.w);
checksums[index_float32] = f32(all_float32);
// Accumulate all float16 into one checksum value.
var all_float16: f32 = 0.0;
all_float16 = accumulate_float16(all_float16, v_in.float16x2.x);
all_float16 = accumulate_float16(all_float16, v_in.float16x2.y);
all_float16 = accumulate_float16(all_float16, v_in.float16x4.x);
all_float16 = accumulate_float16(all_float16, v_in.float16x4.y);
all_float16 = accumulate_float16(all_float16, v_in.float16x4.z);
all_float16 = accumulate_float16(all_float16, v_in.float16x4.w);
checksums[index_float16] = f32(all_float16);
return vec4(0.0);
}
struct AttributeBlock5{
@location(0) unorm10_10_10_2: vec4<f32>,
}
@vertex
fn vertex_block_5(v_in: AttributeBlock5) -> @builtin(position) vec4<f32>
{
init_checksums();
// Accumulate all unorm into one checksum value.
var all_unorm: f32 = 0.0;
all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.x);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.y);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.z);
all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.w);
checksums[index_unorm] = f32(all_unorm);
return vec4(0.0);
}
fn accumulate_uint(accum: u32, val: u32) -> u32 {
return accum + val;
}
fn accumulate_sint(accum: i32, val: i32) -> i32 {
return accum + val;
}
fn accumulate_unorm(accum: f32, val: f32) -> f32 {
return accum + val;
}
fn accumulate_snorm(accum: f32, val: f32) -> f32 {
return accum + val;
}
fn accumulate_float16(accum: f32, val: f32) -> f32 {
return accum + val;
}
fn accumulate_float32(accum: f32, val: f32) -> f32 {
return accum + val;
}
@fragment
fn fragment_main() -> @location(0) vec4<f32> {
return vec4<f32>(0.0);
}

View File

@ -0,0 +1,388 @@
//! Tests that vertex formats pass through to vertex shaders accurately.
use std::num::NonZeroU64;
use wgpu::util::{BufferInitDescriptor, DeviceExt};
use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext};
#[derive(Debug, Copy, Clone)]
enum TestCase {
UnormsAndSnorms,
UintsAndSintsSmall,
UintsBig,
SintsBig,
Floats,
Unorm1010102,
}
struct Test<'a> {
case: TestCase,
entry_point: &'a str,
attributes: &'a [wgt::VertexAttribute],
input: &'a [u8],
checksums: &'a [f32],
}
async fn vertex_formats_all(ctx: TestingContext) {
let attributes_block_0 = &wgpu::vertex_attr_array![
0 => Unorm8x4,
1 => Unorm16x2,
2 => Unorm16x4,
3 => Snorm8x4,
4 => Snorm16x2,
5 => Snorm16x4,
6 => Unorm8x2,
7 => Snorm8x2,
];
let attributes_block_1 = &wgpu::vertex_attr_array![
0 => Uint8x4,
1 => Uint16x2,
2 => Uint16x4,
3 => Sint8x4,
4 => Sint16x2,
5 => Sint16x4,
6 => Uint8x2,
7 => Sint8x2,
];
let attributes_block_2 = &wgpu::vertex_attr_array![
0 => Uint32,
1 => Uint32x2,
2 => Uint32x3,
3 => Uint32x4,
];
let attributes_block_3 = &wgpu::vertex_attr_array![
0 => Sint32,
1 => Sint32x2,
2 => Sint32x3,
3 => Sint32x4,
];
let attributes_block_4 = &wgpu::vertex_attr_array![
0 => Float32,
1 => Float32x2,
2 => Float32x3,
3 => Float32x4,
4 => Float16x2,
5 => Float16x4,
];
let tests = vec![
Test {
case: TestCase::UnormsAndSnorms,
entry_point: "vertex_block_0",
attributes: attributes_block_0,
input: &[
128u8, 128u8, 128u8, 128u8, // Unorm8x4 (0.5, 0.5, 0.5, 0.5)
0u8, 128u8, 0u8, 128u8, // Unorm16x2 (0.5, 0.5)
0u8, 64u8, 0u8, 64u8, 0u8, 64u8, 0u8,
64u8, // Unorm16x4 (0.25, 0.25, 0.25, 0.25)
127u8, 127u8, 127u8, 127u8, // Snorm8x4 (1, 1, 1, 1)
0u8, 128u8, 0u8, 128u8, // Snorm16x2 (-1, -1)
255u8, 127u8, 255u8, 127u8, 255u8, 127u8, 255u8,
127u8, // Snorm16x4 (1, 1, 1, 1)
255u8, 255u8, // Unorm8x2 (1, 1)
128u8, 128u8, // Snorm8x2 (-1, -1)
],
checksums: &[0.0, 0.0, 6.0, 4.0, 0.0, 0.0],
},
Test {
case: TestCase::UintsAndSintsSmall,
entry_point: "vertex_block_1",
attributes: attributes_block_1,
input: &[
4u8, 8u8, 16u8, 32u8, // Uint8x4 (4, 8, 16, 32)
64u8, 0u8, 128u8, 0u8, // Uint16x2 (64, 128)
0u8, 1u8, 0u8, 2u8, 0u8, 4u8, 0u8, 8u8, // Uint16x4 (256, 512, 1024, 2048)
127u8, 127u8, 2u8, 0u8, // Sint8x4 (127, 127, 2, 0)
255u8, 255u8, 1u8, 0u8, // Sint16x2 (-1, 1)
128u8, 255u8, 128u8, 255u8, 0u8, 1u8, 240u8,
255u8, // Sint16x4 (-128, -128, 256, -16)
1u8, 2u8, // Uint8x2 (1, 2)
128u8, 128u8, // Sint8x2 (-128, -128)
],
checksums: &[4095.0, -16.0, 0.0, 0.0, 0.0, 0.0],
},
Test {
case: TestCase::UintsBig,
entry_point: "vertex_block_2",
attributes: attributes_block_2,
input: &[
1u8, 0u8, 0u8, 0u8, // Uint32 (1)
2u8, 0u8, 0u8, 0u8, 4u8, 0u8, 0u8, 0u8, // Uint32x2 (2, 4)
8u8, 0u8, 0u8, 0u8, 16u8, 0u8, 0u8, 0u8, 32u8, 0u8, 0u8,
0u8, // Uint32x3 (8, 16, 32)
64u8, 0u8, 0u8, 0u8, 128u8, 0u8, 0u8, 0u8, 0u8, 1u8, 0u8, 0u8, 0u8, 2u8, 0u8,
0u8, // Uint32x4 (64, 128, 256, 512)
],
checksums: &[1023.0, 0.0, 0.0, 0.0, 0.0, 0.0],
},
Test {
case: TestCase::SintsBig,
entry_point: "vertex_block_3",
attributes: attributes_block_3,
input: &[
128u8, 255u8, 255u8, 255u8, // Sint32 (-128)
120u8, 0u8, 0u8, 0u8, 8u8, 0u8, 0u8, 0u8, // Sint32x2 (120, 8)
252u8, 255u8, 255u8, 255u8, 2u8, 0u8, 0u8, 0u8, 2u8, 0u8, 0u8,
0u8, // Sint32x3 (-4, 2, 2)
24u8, 252u8, 255u8, 255u8, 88u8, 2u8, 0u8, 0u8, 44u8, 1u8, 0u8, 0u8, 99u8, 0u8,
0u8, 0u8, // Sint32x4 (-1000, 600, 300, 99)
],
checksums: &[0.0, -1.0, 0.0, 0.0, 0.0, 0.0],
},
Test {
case: TestCase::Floats,
entry_point: "vertex_block_4",
attributes: attributes_block_4,
input: &[
0u8, 0u8, 0u8, 63u8, // Float32 (0.5)
0u8, 0u8, 0u8, 191u8, 0u8, 0u8, 128u8, 64u8, // Float32x2 (-0.5, 4.0)
0u8, 0u8, 0u8, 192u8, 0u8, 0u8, 204u8, 194u8, 0u8, 0u8, 200u8,
66u8, // Float32x3 (-2.0, -102.0, 100.0)
0u8, 0u8, 92u8, 66u8, 0u8, 0u8, 72u8, 194u8, 0u8, 0u8, 32u8, 65u8, 0u8, 0u8, 128u8,
63u8, // Float32x4 (55.0, -50.0, 10.0, 1.0)
0u8, 60u8, 72u8, 53u8, // Float16x2 (1.0, 0.33)
72u8, 57u8, 0u8, 192u8, 0u8, 188u8, 0u8,
184u8, // Float16x4 (0.66, -2.0, -1.0, -0.5)
],
checksums: &[0.0, 0.0, 0.0, 0.0, -1.5, 16.0],
},
];
vertex_formats_common(ctx, &tests).await;
}
async fn vertex_formats_10_10_10_2(ctx: TestingContext) {
let attributes_block_5 = &wgpu::vertex_attr_array![
0 => Unorm10_10_10_2,
];
let tests = vec![Test {
case: TestCase::Unorm1010102,
entry_point: "vertex_block_5",
attributes: attributes_block_5,
input: &[
// We are aiming for rgba of (0.5, 0.5, 0.5, 0.66)
// Packing AA BB BBBB BBBB GGGG GGGG GG RR RRRR RRRR
// Binary 10 10 0000 0000 1000 0000 00 10 0000 0000
// Hex A0 08 02 00
// Decimal 160 8 2 0
// unorm 0.66 0.5 0.5 0.5 = 2.16
0u8, 2u8, 8u8, 160u8, // Unorm10_10_10_2
],
checksums: &[0.0, 0.0, 2.16, 0.0, 0.0, 0.0],
}];
vertex_formats_common(ctx, &tests).await;
}
async fn vertex_formats_common(ctx: TestingContext, tests: &[Test<'_>]) {
let shader = ctx
.device
.create_shader_module(wgpu::include_wgsl!("draw.vert.wgsl"));
let bgl = ctx
.device
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: None,
entries: &[wgpu::BindGroupLayoutEntry {
binding: 0,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: NonZeroU64::new(4),
},
visibility: wgpu::ShaderStages::VERTEX,
count: None,
}],
});
let ppl = ctx
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bgl],
push_constant_ranges: &[],
});
let dummy = ctx
.device
.create_texture_with_data(
&ctx.queue,
&wgpu::TextureDescriptor {
label: Some("dummy"),
size: wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_DST,
view_formats: &[],
},
wgpu::util::TextureDataOrder::LayerMajor,
&[0, 0, 0, 1],
)
.create_view(&wgpu::TextureViewDescriptor::default());
let mut failed = false;
for test in tests {
let buffer_input = ctx.device.create_buffer_init(&BufferInitDescriptor {
label: None,
contents: bytemuck::cast_slice(test.input),
usage: wgpu::BufferUsages::VERTEX,
});
let pipeline_desc = wgpu::RenderPipelineDescriptor {
label: None,
layout: Some(&ppl),
vertex: wgpu::VertexState {
buffers: &[wgpu::VertexBufferLayout {
array_stride: 0, // Calculate, please!
step_mode: wgpu::VertexStepMode::Vertex,
attributes: test.attributes,
}],
module: &shader,
entry_point: test.entry_point,
compilation_options: Default::default(),
},
primitive: wgpu::PrimitiveState::default(),
depth_stencil: None,
multisample: wgpu::MultisampleState::default(),
fragment: Some(wgpu::FragmentState {
module: &shader,
entry_point: "fragment_main",
compilation_options: Default::default(),
targets: &[Some(wgpu::ColorTargetState {
format: wgpu::TextureFormat::Rgba8Unorm,
blend: None,
write_mask: wgpu::ColorWrites::ALL,
})],
}),
multiview: None,
cache: None,
};
let pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
let expected = test.checksums;
let buffer_size = (std::mem::size_of_val(&expected[0]) * expected.len()) as u64;
let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: buffer_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: buffer_size,
usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::STORAGE,
mapped_at_creation: false,
});
let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bgl,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: gpu_buffer.as_entire_binding(),
}],
});
let mut encoder1 = ctx
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
let mut rpass = encoder1.begin_render_pass(&wgpu::RenderPassDescriptor {
label: None,
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
ops: wgpu::Operations::default(),
resolve_target: None,
view: &dummy,
})],
depth_stencil_attachment: None,
timestamp_writes: None,
occlusion_query_set: None,
});
rpass.set_vertex_buffer(0, buffer_input.slice(..));
rpass.set_pipeline(&pipeline);
rpass.set_bind_group(0, &bg, &[]);
// Draw three vertices and no instance, which is enough to generate the
// checksums.
rpass.draw(0..3, 0..1);
drop(rpass);
let mut encoder2 = ctx
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
encoder2.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, buffer_size);
// See https://github.com/gfx-rs/wgpu/issues/4732 for why this is split between two submissions
// with a hard wait in between.
ctx.queue.submit([encoder1.finish()]);
ctx.async_poll(wgpu::Maintain::wait())
.await
.panic_on_timeout();
ctx.queue.submit([encoder2.finish()]);
let slice = cpu_buffer.slice(..);
slice.map_async(wgpu::MapMode::Read, |_| ());
ctx.async_poll(wgpu::Maintain::wait())
.await
.panic_on_timeout();
let data: Vec<f32> = bytemuck::cast_slice(&slice.get_mapped_range()).to_vec();
let case_name = format!("Case {:?}", test.case);
// Calculate the difference between data and expected. Since the data is
// a bunch of float checksums, we allow a fairly large epsilon, which helps
// with the accumulation of float rounding errors.
const EPSILON: f32 = 0.01;
let mut deltas = data.iter().zip(expected.iter()).map(|(d, e)| (d - e).abs());
if deltas.any(|x| x > EPSILON) {
eprintln!(
"Failed: Got: {:?} Expected: {:?} - {case_name}",
data, expected,
);
failed = true;
continue;
}
eprintln!("Passed: {case_name}");
}
assert!(!failed);
}
#[gpu_test]
static VERTEX_FORMATS_ALL: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(
TestParameters::default()
.test_features_limits()
.features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
)
.run_async(vertex_formats_all);
// Some backends can handle Unorm10_10_10_2, but GL backends seem to throw this error:
// Validation Error: GL_INVALID_ENUM in glVertexAttribFormat(type = GL_UNSIGNED_INT_10_10_10_2)
#[gpu_test]
static VERTEX_FORMATS_10_10_10_2: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(
TestParameters::default()
.expect_fail(FailureCase::backend(wgpu::Backends::GL))
.test_features_limits()
.features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
)
.run_async(vertex_formats_10_10_10_2);

View File

@ -166,7 +166,6 @@ struct Test {
id_source: IdSource,
draw_call_kind: DrawCallKind,
encoder_kind: EncoderKind,
vertex_pulling_transform: bool,
}
impl Test {
@ -280,15 +279,6 @@ async fn vertex_index_common(ctx: TestingContext) {
cache: None,
};
let builtin_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
pipeline_desc
.vertex
.compilation_options
.vertex_pulling_transform = true;
let builtin_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc);
pipeline_desc
.vertex
.compilation_options
.vertex_pulling_transform = false;
pipeline_desc.vertex.entry_point = "vs_main_buffers";
pipeline_desc.vertex.buffers = &[
@ -304,15 +294,6 @@ async fn vertex_index_common(ctx: TestingContext) {
},
];
let buffer_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
pipeline_desc
.vertex
.compilation_options
.vertex_pulling_transform = true;
let buffer_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc);
pipeline_desc
.vertex
.compilation_options
.vertex_pulling_transform = false;
let dummy = ctx
.device
@ -341,18 +322,12 @@ async fn vertex_index_common(ctx: TestingContext) {
.cartesian_product(IdSource::iter())
.cartesian_product(DrawCallKind::iter())
.cartesian_product(EncoderKind::iter())
.cartesian_product([false, true])
.map(
|((((case, id_source), draw_call_kind), encoder_kind), vertex_pulling_transform)| {
Test {
case,
id_source,
draw_call_kind,
encoder_kind,
vertex_pulling_transform,
}
},
)
.map(|(((case, id_source), draw_call_kind), encoder_kind)| Test {
case,
id_source,
draw_call_kind,
encoder_kind,
})
.collect::<Vec<_>>();
let features = ctx.adapter.features();
@ -360,20 +335,8 @@ async fn vertex_index_common(ctx: TestingContext) {
let mut failed = false;
for test in tests {
let pipeline = match test.id_source {
IdSource::Buffers => {
if test.vertex_pulling_transform {
&buffer_pipeline_vpt
} else {
&buffer_pipeline
}
}
IdSource::Builtins => {
if test.vertex_pulling_transform {
&builtin_pipeline_vpt
} else {
&builtin_pipeline
}
}
IdSource::Buffers => &buffer_pipeline,
IdSource::Builtins => &builtin_pipeline,
};
let expected = test.expectation(&ctx);

View File

@ -1484,7 +1484,6 @@ impl Global {
.vertex
.stage
.zero_initialize_workgroup_memory,
vertex_pulling_transform: desc.vertex.stage.vertex_pulling_transform,
};
ResolvedVertexState {
stage,
@ -1511,7 +1510,6 @@ impl Global {
.vertex
.stage
.zero_initialize_workgroup_memory,
vertex_pulling_transform: state.stage.vertex_pulling_transform,
};
Some(ResolvedFragmentState {
stage,
@ -1720,7 +1718,6 @@ impl Global {
entry_point: desc.stage.entry_point.clone(),
constants: desc.stage.constants.clone(),
zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
vertex_pulling_transform: desc.stage.vertex_pulling_transform,
};
let desc = ResolvedComputePipelineDescriptor {

View File

@ -2694,7 +2694,6 @@ impl<A: HalApi> Device<A> {
entry_point: final_entry_point_name.as_ref(),
constants: desc.stage.constants.as_ref(),
zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
vertex_pulling_transform: false,
},
cache: cache.as_ref().and_then(|it| it.raw.as_ref()),
};
@ -3114,7 +3113,6 @@ impl<A: HalApi> Device<A> {
entry_point: &vertex_entry_point_name,
constants: stage_desc.constants.as_ref(),
zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory,
vertex_pulling_transform: stage_desc.vertex_pulling_transform,
}
};
@ -3171,7 +3169,6 @@ impl<A: HalApi> Device<A> {
zero_initialize_workgroup_memory: fragment_state
.stage
.zero_initialize_workgroup_memory,
vertex_pulling_transform: false,
})
}
None => None,

View File

@ -147,8 +147,6 @@ pub struct ProgrammableStageDescriptor<'a> {
/// This is required by the WebGPU spec, but may have overhead which can be avoided
/// for cross-platform applications
pub zero_initialize_workgroup_memory: bool,
/// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
pub vertex_pulling_transform: bool,
}
/// Describes a programmable pipeline stage.
@ -176,8 +174,6 @@ pub struct ResolvedProgrammableStageDescriptor<'a, A: HalApi> {
/// This is required by the WebGPU spec, but may have overhead which can be avoided
/// for cross-platform applications
pub zero_initialize_workgroup_memory: bool,
/// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
pub vertex_pulling_transform: bool,
}
/// Number of implicit bind groups derived at pipeline creation.

View File

@ -257,7 +257,6 @@ impl<A: hal::Api> Example<A> {
entry_point: "vs_main",
constants: &constants,
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
},
vertex_buffers: &[],
fragment_stage: Some(hal::ProgrammableStage {
@ -265,7 +264,6 @@ impl<A: hal::Api> Example<A> {
entry_point: "fs_main",
constants: &constants,
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
}),
primitive: wgt::PrimitiveState {
topology: wgt::PrimitiveTopology::TriangleStrip,

View File

@ -379,7 +379,6 @@ impl<A: hal::Api> Example<A> {
entry_point: "main",
constants: &Default::default(),
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
},
cache: None,
})

View File

@ -1870,8 +1870,6 @@ pub struct ProgrammableStage<'a, A: Api> {
/// This is required by the WebGPU spec, but may have overhead which can be avoided
/// for cross-platform applications
pub zero_initialize_workgroup_memory: bool,
/// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
pub vertex_pulling_transform: bool,
}
// Rust gets confused about the impl requirements for `A`
@ -1882,7 +1880,6 @@ impl<A: Api> Clone for ProgrammableStage<'_, A> {
entry_point: self.entry_point,
constants: self.constants,
zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory,
vertex_pulling_transform: self.vertex_pulling_transform,
}
}
}

View File

@ -158,7 +158,7 @@ impl super::Device {
metal::MTLPrimitiveTopologyClass::Point => true,
_ => false,
},
vertex_pulling_transform: stage.vertex_pulling_transform,
vertex_pulling_transform: true,
vertex_buffer_mappings: vertex_buffer_mappings.to_vec(),
};

View File

@ -1180,10 +1180,6 @@ impl crate::Context for ContextWgpuCore {
.vertex
.compilation_options
.zero_initialize_workgroup_memory,
vertex_pulling_transform: desc
.vertex
.compilation_options
.vertex_pulling_transform,
},
buffers: Borrowed(&vertex_buffers),
},
@ -1198,7 +1194,6 @@ impl crate::Context for ContextWgpuCore {
zero_initialize_workgroup_memory: frag
.compilation_options
.zero_initialize_workgroup_memory,
vertex_pulling_transform: false,
},
targets: Borrowed(frag.targets),
}),
@ -1244,7 +1239,6 @@ impl crate::Context for ContextWgpuCore {
zero_initialize_workgroup_memory: desc
.compilation_options
.zero_initialize_workgroup_memory,
vertex_pulling_transform: false,
},
cache: desc.cache.map(|c| c.id.into()),
};

View File

@ -2059,8 +2059,6 @@ pub struct PipelineCompilationOptions<'a> {
/// This is required by the WebGPU spec, but may have overhead which can be avoided
/// for cross-platform applications
pub zero_initialize_workgroup_memory: bool,
/// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
pub vertex_pulling_transform: bool,
}
impl<'a> Default for PipelineCompilationOptions<'a> {
@ -2074,7 +2072,6 @@ impl<'a> Default for PipelineCompilationOptions<'a> {
Self {
constants,
zero_initialize_workgroup_memory: true,
vertex_pulling_transform: false,
}
}
}